Add CT3531 Database Systems II
This commit is contained in:
Binary file not shown.
Binary file not shown.
@ -0,0 +1,82 @@
|
|||||||
|
-- One row per team; the team's name is assumed unique and serves as the key.
CREATE TABLE teams (
    team_name  VARCHAR(255) NOT NULL, -- assuming team name is unique
    home_venue VARCHAR(255) NOT NULL, -- assuming that two teams could share a home venue (e.g, A teams and B teams)
    manager    VARCHAR(255),          -- assuming no other information is known/required for managers other than name

    PRIMARY KEY (team_name)
);
|
||||||
|
|
||||||
|
-- One row per player; identified by a surrogate key since names are not unique.
CREATE TABLE players (
    player_id    INT NOT NULL AUTO_INCREMENT,
    team_name    VARCHAR(255) NOT NULL,
    squad_number INT NOT NULL,          -- assuming squad number is unique within squads
    player_name  VARCHAR(255) NOT NULL,

    PRIMARY KEY (player_id),
    -- enforce the stated assumption that a squad number is unique within a team
    CONSTRAINT players_team_squad_uq UNIQUE (team_name, squad_number),
    CONSTRAINT players_team_name_fk FOREIGN KEY (team_name) REFERENCES teams(team_name)
);
|
||||||
|
|
||||||
|
-- One row per scheduled game; a surrogate key is used because two otherwise
-- identical games could exist (no natural uniquely identifying attributes).
CREATE TABLE games (
    game_id           INT NOT NULL AUTO_INCREMENT,
    home_team         VARCHAR(255) NOT NULL, -- venue can be inferred from this
    away_team         VARCHAR(255) NOT NULL,
    path_to_heap_file VARCHAR(255),

    PRIMARY KEY (game_id),
    -- a team cannot play against itself (enforced from MySQL 8.0.16; parsed but ignored on older versions)
    CONSTRAINT games_distinct_teams_check CHECK (home_team <> away_team),
    CONSTRAINT games_home_team_fk FOREIGN KEY (home_team) REFERENCES teams(team_name),
    CONSTRAINT games_away_team_fk FOREIGN KEY (away_team) REFERENCES teams(team_name)
);
|
||||||
|
|
||||||
|
-- this table should only be inserted into upon the completion of a game
CREATE TABLE results (
    game_id INT NOT NULL,
    winner  VARCHAR(255), -- NULL value indicates a draw

    PRIMARY KEY (game_id),
    -- a result must refer to an existing game (FK was previously missing)
    CONSTRAINT results_game_id_fk FOREIGN KEY (game_id) REFERENCES games(game_id),
    CONSTRAINT results_winner_fk FOREIGN KEY (winner) REFERENCES teams(team_name)
);
|
||||||
|
|
||||||
|
-- data redundancy here: players rarely change during match
CREATE TABLE active_players (
    active_time TIMESTAMP NOT NULL,
    game_id     INT NOT NULL,
    player_id   INT NOT NULL,

    -- a player can only be active in one game at any given instant
    PRIMARY KEY (active_time, player_id),
    FOREIGN KEY (game_id) REFERENCES games(game_id),
    FOREIGN KEY (player_id) REFERENCES players(player_id)
);
|
||||||
|
|
||||||
|
-- Records each substitution: the player coming off and the player going on.
CREATE TABLE substitutions (
    substitution_time TIMESTAMP NOT NULL,
    game_id           INT NOT NULL,
    off_player_id     INT NOT NULL,
    on_player_id      INT NOT NULL,

    -- assuming two substitutions could be done at the same time
    PRIMARY KEY (substitution_time, off_player_id),
    FOREIGN KEY (game_id) REFERENCES games(game_id),
    FOREIGN KEY (off_player_id) REFERENCES players(player_id),
    FOREIGN KEY (on_player_id) REFERENCES players(player_id)
);
|
||||||
|
|
||||||
|
-- Records each sendoff (player dismissed from a game).
CREATE TABLE sendoffs (
    sendoff_time TIMESTAMP NOT NULL,
    game_id      INT NOT NULL,
    player_id    INT NOT NULL,

    -- assuming two sendoffs could be done at the same time - otherwise could just use time
    PRIMARY KEY (sendoff_time, player_id),
    FOREIGN KEY (game_id) REFERENCES games(game_id),
    FOREIGN KEY (player_id) REFERENCES players(player_id)
);
|
||||||
|
|
||||||
|
-- Records each goal scored and which team the points are awarded to
-- (an own goal benefits the opposing team, hence benefitting_team is explicit).
CREATE TABLE goals (
    goal_time        TIMESTAMP NOT NULL,    -- assuming two goals can't be scored at the same time
    game_id          INT NOT NULL,
    player_id        INT NOT NULL,          -- can infer squad number from this
    benefitting_team VARCHAR(255) NOT NULL, -- the team to which the points are awarded for this goal

    PRIMARY KEY (goal_time, game_id),
    FOREIGN KEY (game_id) REFERENCES games(game_id),
    FOREIGN KEY (player_id) REFERENCES players(player_id),
    FOREIGN KEY (benefitting_team) REFERENCES teams(team_name)
);
|
@ -0,0 +1,36 @@
|
|||||||
|
-- 1. List all players playing for a given team
-- NOTE: string literals use single quotes; double quotes denote identifiers
-- under ANSI_QUOTES and are not portable as string delimiters.
SELECT player_name
FROM players
WHERE team_name = 'man u';
|
||||||
|
|
||||||
|
-- 2. List all players who have scored in a given game
-- game_id is an INT, so compare against a numeric value
-- (comparing against a quoted string forces implicit type coercion)
SELECT player_id FROM goals WHERE game_id = 1;

-- same query, but resolving player IDs to names
SELECT players.player_name
FROM goals
INNER JOIN players ON goals.player_id = players.player_id
WHERE goals.game_id = 1;
|
||||||
|
|
||||||
|
-- 3. List the top five goal scorers in the league
-- (the original file contained this statement duplicated verbatim; removed)
SELECT player_id, COUNT(*) AS goals_scored
FROM goals
GROUP BY player_id
ORDER BY goals_scored DESC
LIMIT 5;

-- same query, but resolving player IDs to names;
-- player_name is included in GROUP BY for portability (ANSI / ONLY_FULL_GROUP_BY)
SELECT
    players.player_name, COUNT(*) AS goals_scored
FROM
    goals INNER JOIN players ON goals.player_id = players.player_id
GROUP BY players.player_id, players.player_name
ORDER BY goals_scored DESC
LIMIT 5;
|
||||||
|
|
||||||
|
-- 4. List all teams and the amounts of points that they have so far
-- drawback: adding up each time, costly if this is a common query
-- BUG FIX: with the LEFT JOINs, a team with no games (or a game with no results
-- row yet) produces a NULL results.winner, which the old CASE scored as a
-- draw (1 point). Guard on results.game_id IS NULL so only a genuine results
-- row with a NULL winner counts as a draw.
SELECT
    teams.team_name,
    SUM(
        CASE
            -- no recorded result (team has no games, or game not yet completed)
            WHEN results.game_id IS NULL THEN 0
            WHEN results.winner = teams.team_name THEN 3
            -- a results row with NULL winner indicates a draw
            WHEN results.winner IS NULL THEN 1
            ELSE 0
        END
    ) AS total_points
FROM
    teams
    LEFT JOIN games ON teams.team_name = games.home_team OR teams.team_name = games.away_team
    LEFT JOIN results ON games.game_id = results.game_id
GROUP BY
    teams.team_name;
|
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{determine\PYGZus{}grid}\PYG{p}{(}\PYG{n}{x}\PYG{p}{,}\PYG{n}{y}\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,14 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
\PYG{c+c1}{\PYGZsh{} input: a pair of (x,y) co-ordinates}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} output: a string that identifies the grid}
|
||||||
|
\PYG{k}{def} \PYG{n+nf}{determine\PYGZus{}grid}\PYG{p}{(}\PYG{n}{x}\PYG{p}{,} \PYG{n}{y}\PYG{p}{):}
|
||||||
|
\PYG{n}{grid\PYGZus{}width} \PYG{o}{=} \PYG{l+m+mi}{29} \PYG{c+c1}{\PYGZsh{} width of each grid in metres}
|
||||||
|
\PYG{n}{grid\PYGZus{}height} \PYG{o}{=} \PYG{l+m+mi}{15} \PYG{c+c1}{\PYGZsh{} height of each grid in metres}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} determining the location of the grid using floor division}
|
||||||
|
\PYG{n}{grid\PYGZus{}x} \PYG{o}{=} \PYG{n}{x} \PYG{o}{//} \PYG{n}{grid\PYGZus{}width}
|
||||||
|
\PYG{n}{grid\PYGZus{}y} \PYG{o}{=} \PYG{n}{y} \PYG{o}{//} \PYG{n}{grid\PYGZus{}height}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} return the grid identifier in the format `x.y`}
|
||||||
|
\PYG{k}{return} \PYG{n+nb}{str}\PYG{p}{(}\PYG{n}{grid\PYGZus{}x}\PYG{p}{)} \PYG{o}{+} \PYG{l+s+s2}{\PYGZdq{}.\PYGZdq{}} \PYG{o}{+} \PYG{n+nb}{str}\PYG{p}{(}\PYG{n}{grid\PYGZus{}y}\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{determine\PYGZus{}grid}\PYG{p}{(}\PYG{n}{x}\PYG{p}{,} \PYG{n}{y}\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{determine\PYGZus{}grid}\PYG{p}{()}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,29 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{c+c1}{\PYGZsh{} function to return the times when two given players are in the same location}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} input: player ID of both players}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} output: array of timestamps}
|
||||||
|
\PYG{k}{def} \PYG{n+nf}{same\PYGZus{}times}\PYG{p}{(}\PYG{n}{player1}\PYG{p}{,} \PYG{n}{player2}\PYG{p}{,} \PYG{n}{grid}\PYG{p}{):}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} get the times both players are in the given grid}
|
||||||
|
\PYG{n}{player1\PYGZus{}times} \PYG{o}{=} \PYG{n}{query}\PYG{p}{(}\PYG{l+s+s2}{\PYGZdq{}select times from table @ \PYGZdq{}} \PYG{o}{+} \PYG{n+nb}{hash}\PYG{p}{(}\PYG{n}{player1}\PYG{p}{,} \PYG{n}{grid}\PYG{p}{))}
|
||||||
|
\PYG{n}{player2\PYGZus{}times} \PYG{o}{=} \PYG{n}{query}\PYG{p}{(}\PYG{l+s+s2}{\PYGZdq{}select times from table @ \PYGZdq{}} \PYG{o}{+} \PYG{n+nb}{hash}\PYG{p}{(}\PYG{n}{player2}\PYG{p}{,} \PYG{n}{grid}\PYG{p}{))}
|
||||||
|
|
||||||
|
\PYG{n}{same\PYGZus{}times} \PYG{o}{=} \PYG{p}{[]}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} index i for player1\PYGZus{}times and index j for player2\PYGZus{}times}
|
||||||
|
\PYG{n}{i} \PYG{o}{=} \PYG{n}{j} \PYG{o}{=} \PYG{l+m+mi}{0}
|
||||||
|
|
||||||
|
\PYG{k}{while} \PYG{n}{i} \PYG{o}{\PYGZlt{}} \PYG{n+nb}{len}\PYG{p}{(}\PYG{n}{player1\PYGZus{}times}\PYG{p}{)} \PYG{o+ow}{and} \PYG{n}{j} \PYG{o}{\PYGZlt{}} \PYG{n+nb}{len}\PYG{p}{(}\PYG{n}{player2\PYGZus{}times}\PYG{p}{):}
|
||||||
|
\PYG{k}{if} \PYG{n}{player1\PYGZus{}times}\PYG{p}{[}\PYG{n}{i}\PYG{p}{]} \PYG{o}{==} \PYG{n}{player2\PYGZus{}times}\PYG{p}{[}\PYG{n}{j}\PYG{p}{]:}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} both players are in the grid cell at this time}
|
||||||
|
\PYG{n}{same\PYGZus{}times}\PYG{o}{.}\PYG{n}{append}\PYG{p}{(}\PYG{n}{player1\PYGZus{}times}\PYG{p}{[}\PYG{n}{i}\PYG{p}{])}
|
||||||
|
\PYG{n}{i} \PYG{o}{+=} \PYG{l+m+mi}{1}
|
||||||
|
\PYG{n}{j} \PYG{o}{+=} \PYG{l+m+mi}{1}
|
||||||
|
\PYG{k}{elif} \PYG{n}{player2\PYGZus{}times}\PYG{p}{[}\PYG{n}{j}\PYG{p}{]} \PYG{o}{\PYGZgt{}} \PYG{n}{player1\PYGZus{}times}\PYG{p}{[}\PYG{n}{i}\PYG{p}{]:}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} increment the player1 index (i) as it\PYGZsq{}s smaller than the time at j}
|
||||||
|
\PYG{n}{i} \PYG{o}{+=} \PYG{l+m+mi}{1}
|
||||||
|
\PYG{k}{else}\PYG{p}{:}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} increment the player2 index (j) as it\PYGZsq{}s smaller than the time at i}
|
||||||
|
\PYG{n}{j} \PYG{o}{+=} \PYG{l+m+mi}{1}
|
||||||
|
|
||||||
|
\PYG{k}{return} \PYG{n}{same\PYGZus{}times}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,27 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
\PYG{c+c1}{\PYGZsh{} function to determine the number of times a given player was in a given grid}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} input: player ID, grid identifier}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} output: count of the number of times that player ID was recorded in that grid}
|
||||||
|
\PYG{k}{def} \PYG{n+nf}{player\PYGZus{}grid\PYGZus{}values}\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{,} \PYG{n}{grid\PYGZus{}id}\PYG{p}{):}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} get table location on disk}
|
||||||
|
\PYG{n}{table\PYGZus{}location} \PYG{o}{=} \PYG{n+nb}{hash}\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{,} \PYG{n}{grid\PYGZus{}id}\PYG{p}{)}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} query the number of rows in that table using some pseudo-SQL nonsense}
|
||||||
|
\PYG{n}{count} \PYG{o}{=} \PYG{n}{select}\PYG{p}{(}\PYG{l+s+s2}{\PYGZdq{}count(*) from table @ \PYGZdq{}} \PYG{o}{+} \PYG{n}{table\PYGZus{}location}\PYG{p}{)}
|
||||||
|
|
||||||
|
\PYG{k}{return} \PYG{n}{count}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} function to generate an array of heatmap values for every location on the pitch}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} input: an array of locations on the pitch, an array of the player IDs in the game}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} output: an array of numbers indicating the number of times a player was in each location}
|
||||||
|
\PYG{k}{def} \PYG{n+nf}{generate\PYGZus{}heatmap}\PYG{p}{(}\PYG{n}{locations}\PYG{p}{,} \PYG{n}{players}\PYG{p}{):}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} output array will hold the number of times a player was in each location, in the same order as the locations array}
|
||||||
|
\PYG{n}{heatmap\PYGZus{}values} \PYG{o}{=} \PYG{p}{[]} \PYG{c+c1}{\PYGZsh{} assume each index initialises to 0}
|
||||||
|
|
||||||
|
\PYG{c+c1}{\PYGZsh{} iterate over each location}
|
||||||
|
\PYG{k}{for} \PYG{p}{(}\PYG{n+nb}{int} \PYG{n}{i} \PYG{o}{=} \PYG{l+m+mi}{0}\PYG{p}{;} \PYG{n}{i} \PYG{o}{\PYGZlt{}} \PYG{n}{locations}\PYG{o}{.}\PYG{n}{length}\PYG{p}{;} \PYG{n}{i}\PYG{o}{++}\PYG{p}{):}
|
||||||
|
\PYG{c+c1}{\PYGZsh{} iterate over each player for each location}
|
||||||
|
\PYG{k}{for} \PYG{p}{(}\PYG{n+nb}{int} \PYG{n}{j} \PYG{o}{=} \PYG{l+m+mi}{0}\PYG{p}{;} \PYG{n}{j} \PYG{o}{\PYGZlt{}} \PYG{n}{players}\PYG{o}{.}\PYG{n}{length}\PYG{p}{;} \PYG{n}{j}\PYG{o}{++}\PYG{p}{):}
|
||||||
|
\PYG{n}{player\PYGZus{}id} \PYG{o}{=} \PYG{n}{players}\PYG{p}{[}\PYG{n}{j}\PYG{p}{]}
|
||||||
|
\PYG{n}{heatmap\PYGZus{}values}\PYG{p}{[}\PYG{n}{i}\PYG{p}{]} \PYG{o}{+=} \PYG{n}{player\PYGZus{}grid\PYGZus{}values}\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{,} \PYG{n}{locations}\PYG{p}{[}\PYG{n}{i}\PYG{p}{])}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,76 @@
|
|||||||
|
|
||||||
|
\makeatletter
|
||||||
|
\def\PYG@reset{\let\PYG@it=\relax \let\PYG@bf=\relax%
|
||||||
|
\let\PYG@ul=\relax \let\PYG@tc=\relax%
|
||||||
|
\let\PYG@bc=\relax \let\PYG@ff=\relax}
|
||||||
|
\def\PYG@tok#1{\csname PYG@tok@#1\endcsname}
|
||||||
|
\def\PYG@toks#1+{\ifx\relax#1\empty\else%
|
||||||
|
\PYG@tok{#1}\expandafter\PYG@toks\fi}
|
||||||
|
\def\PYG@do#1{\PYG@bc{\PYG@tc{\PYG@ul{%
|
||||||
|
\PYG@it{\PYG@bf{\PYG@ff{#1}}}}}}}
|
||||||
|
\def\PYG#1#2{\PYG@reset\PYG@toks#1+\relax+\PYG@do{#2}}
|
||||||
|
|
||||||
|
\@namedef{PYG@tok@c}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cp}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cs}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@k}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kd}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nb}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@bp}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nn}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nf}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nv}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@no}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ow}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@s}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@err}{\def\PYG@bc##1{{\setlength{\fboxsep}{\string -\fboxrule}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}}
|
||||||
|
\@namedef{PYG@tok@kc}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kn}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kp}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kr}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kt}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@fm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vg}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vi}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sa}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sb}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sc}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@dl}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sd}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s2}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@se}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sh}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@si}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sx}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sr}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ss}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ch}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cm}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cpf}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@c1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
|
||||||
|
\def\PYGZbs{\char`\\}
|
||||||
|
\def\PYGZus{\char`\_}
|
||||||
|
\def\PYGZob{\char`\{}
|
||||||
|
\def\PYGZcb{\char`\}}
|
||||||
|
\def\PYGZca{\char`\^}
|
||||||
|
\def\PYGZam{\char`\&}
|
||||||
|
\def\PYGZlt{\char`\<}
|
||||||
|
\def\PYGZgt{\char`\>}
|
||||||
|
\def\PYGZsh{\char`\#}
|
||||||
|
\def\PYGZpc{\char`\%}
|
||||||
|
\def\PYGZdl{\char`\$}
|
||||||
|
\def\PYGZhy{\char`\-}
|
||||||
|
\def\PYGZsq{\char`\'}
|
||||||
|
\def\PYGZdq{\char`\"}
|
||||||
|
\def\PYGZti{\char`\~}
|
||||||
|
% for compatibility with earlier versions
|
||||||
|
\def\PYGZat{@}
|
||||||
|
\def\PYGZlb{[}
|
||||||
|
\def\PYGZrb{]}
|
||||||
|
\makeatother
|
||||||
|
|
Binary file not shown.
@ -0,0 +1,286 @@
|
|||||||
|
%! TeX program = lualatex
|
||||||
|
\documentclass[a4paper,11pt]{article}
|
||||||
|
% packages
|
||||||
|
\usepackage{fontspec}
|
||||||
|
\setmainfont{EB Garamond}
|
||||||
|
% for tironian et fallback
|
||||||
|
% % \directlua{luaotfload.add_fallback
|
||||||
|
% % ("emojifallback",
|
||||||
|
% % {"Noto Serif:mode=harf"}
|
||||||
|
% % )}
|
||||||
|
% % \setmainfont{EB Garamond}[RawFeature={fallback=emojifallback}]
|
||||||
|
|
||||||
|
\setmonofont[Scale=MatchLowercase]{DejaVu Sans Mono}
|
||||||
|
\usepackage[a4paper,left=2cm,right=2cm,top=\dimexpr15mm+1.5\baselineskip,bottom=2cm]{geometry}
|
||||||
|
\setlength{\parindent}{0pt}
|
||||||
|
|
||||||
|
\usepackage{fancyhdr} % Headers and footers
|
||||||
|
\fancyhead[R]{\normalfont \leftmark}
|
||||||
|
\fancyhead[L]{}
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\usepackage{amsmath}
|
||||||
|
\usepackage{microtype} % Slightly tweak font spacing for aesthetics
|
||||||
|
\usepackage[english]{babel} % Language hyphenation and typographical rules
|
||||||
|
\usepackage[final, colorlinks = false, urlcolor = cyan]{hyperref}
|
||||||
|
\usepackage{changepage} % adjust margins on the fly
|
||||||
|
|
||||||
|
\usepackage{minted}
|
||||||
|
\usemintedstyle{algol_nu}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
|
||||||
|
\usepackage{pgfplots}
|
||||||
|
\pgfplotsset{width=\textwidth,compat=1.9}
|
||||||
|
|
||||||
|
\usepackage{caption}
|
||||||
|
\newenvironment{code}{\captionsetup{type=listing}}{}
|
||||||
|
\captionsetup[listing]{skip=0pt}
|
||||||
|
\setlength{\abovecaptionskip}{5pt}
|
||||||
|
\setlength{\belowcaptionskip}{5pt}
|
||||||
|
|
||||||
|
\usepackage[yyyymmdd]{datetime}
|
||||||
|
\renewcommand{\dateseparator}{--}
|
||||||
|
|
||||||
|
\renewcommand{\labelenumii}{\arabic{enumi}.\arabic{enumii}}
|
||||||
|
|
||||||
|
\usepackage{titlesec}
|
||||||
|
|
||||||
|
\author{Andrew Hayes, Conor McNamara, Maxwell Maia}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\begin{titlepage}
|
||||||
|
\begin{center}
|
||||||
|
\hrule
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\huge \textbf{CT3532 Database Systems II}
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\hrule
|
||||||
|
\LARGE
|
||||||
|
\vspace{0.5cm}
|
||||||
|
Assignment 2: Indexing
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
|
||||||
|
\vfill
|
||||||
|
%\includegraphics[width=\textwidth]{images/uniog.jpg}
|
||||||
|
\vfill
|
||||||
|
|
||||||
|
\Large
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
\vspace{0.5cm}
|
||||||
|
|
||||||
|
\raggedright
|
||||||
|
\begin{minipage}{0.329\textwidth}
|
||||||
|
\centering
|
||||||
|
\textbf{Andrew Hayes}
|
||||||
|
|
||||||
|
\normalsize
|
||||||
|
Student ID: 21321503
|
||||||
|
\end{minipage}
|
||||||
|
\begin{minipage}{0.329\textwidth}
|
||||||
|
\centering
|
||||||
|
\textbf{Conor McNamara}
|
||||||
|
|
||||||
|
\normalsize
|
||||||
|
Student ID: 21378116
|
||||||
|
\end{minipage}
|
||||||
|
\begin{minipage}{0.329\textwidth}
|
||||||
|
\centering
|
||||||
|
\textbf{Maxwell Maia}
|
||||||
|
|
||||||
|
\normalsize
|
||||||
|
Student ID: 21236277
|
||||||
|
\end{minipage}
|
||||||
|
|
||||||
|
\centering
|
||||||
|
%\today
|
||||||
|
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
\end{center}
|
||||||
|
\end{titlepage}
|
||||||
|
|
||||||
|
\pagenumbering{roman}
|
||||||
|
\newpage
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
\setcounter{page}{1}
|
||||||
|
\pagenumbering{arabic}
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
\section{Indexing Approach \& Algorithm}
|
||||||
|
\subsection{Indexing Approach}\label{sec:indexingapproach}
|
||||||
|
\label{indexing}
|
||||||
|
To construct a heatmap which describes the location of a player over the course of a game, we need to be able to retrieve the number of times a player was in a given location over the course of a game.
|
||||||
|
We are given a data file for any given game that is already sorted in temporal order containing the timestamp, a triple containing a player's ID \& their $(x,y)$ co-ordinates, and a triple containing the $(x,y,z)$ co-ordinates of the football.
|
||||||
|
\\\\
|
||||||
|
We decided to make use of an indexing approach that was optimised specifically for facilitating queries about the amount of time a given player spent in a certain location.
|
||||||
|
We took advantage of the fact that the data is already sorted in temporal order, which saves us from having to sort the data at all.
|
||||||
|
At a high level, our indexing approach involved creating one table for each player-location pair, resulting in many tables, each containing one entry for each recorded instance of that particular player being in that particular location.
|
||||||
|
The location of the tables on the disk is determined by a hash function, with a hash on the player-location pair, allowing fast lookup.
|
||||||
|
\\\\
|
||||||
|
Our proposed indexing approach is as follows:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Define the grid cells that we will be examining for the creation of the heatmap.
|
||||||
|
The number \& size of these cells doesn't really matter from an algorithmic perspective, but the smaller / more numerous they are, the more tables we are going to need and the fewer grids we have, the less precise the heatmap data.
|
||||||
|
We can either define a custom number of grids, or we could take the grids that have already been defined for the gathering of the data -- each $(x,y)$ being a grid.
|
||||||
|
The trade-off here is either having a less precise heatmap but with fewer tables, or a more precise heatmap with many more tables.
|
||||||
|
If we take, for example, the dimensions of Croke Park (rounded for convenience) $145 \times 90$ metres, we can split the pitch into thirty $29 \times 15$ metre grid cells, as shown below.
|
||||||
|
This gives what we felt was a reasonable balance between precision \& having many tables, but these dimensions could be easily changed as needed for a practical implementation.
|
||||||
|
If we wanted ultimate precision, we could take each co-ordinate recorded in the data file to be a ``grid''.
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=0.7\textwidth]{./pitch.png}
|
||||||
|
\caption{A $145 \times 90$ Metre Pitch Split into Thirty $29 \times 15$ Metre Grid Cells}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\item Define a function which returns the grid cell given an $(x,y)$ pair.
|
||||||
|
We identify each grid with a name in the format \verb|x.y| where \verb|x| identifies the number of the column in which the grid is found (counted from left to right in the above figure) and \verb|y| is the number of the row in which the grid is found (counted from bottom to top in the above figure), e.g. the bottom left-hand grid would be referred to as \verb|0.0| and the top right-hand grid would be referred to as \verb|5.6|.
|
||||||
|
The following is a possible Python implementation of such a function, for our chosen pitch \& grid dimensions, assuming that the $x$-axis of the pitch is the long side and that $(x,y)$ co-ordinates of a player are just the number of metres that the player is from the bottom left-hand corner on the $x$ \& $y$ axes.
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{Python}
|
||||||
|
# input: a pair of (x,y) co-ordinates
|
||||||
|
# output: a string that identifies the grid
|
||||||
|
def determine_grid(x, y):
|
||||||
|
grid_width = 29 # width of each grid in metres
|
||||||
|
grid_height = 15 # height of each grid in metres
|
||||||
|
|
||||||
|
# determining the location of the grid using floor division
|
||||||
|
grid_x = x // grid_width
|
||||||
|
grid_y = y // grid_height
|
||||||
|
|
||||||
|
# return the grid identifier in the format `x.y`
|
||||||
|
return str(grid_x) + "." + str(grid_y)
|
||||||
|
\end{minted}
|
||||||
|
\caption{\texttt{determine\_grid()}}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\item Define a hash function $h(\texttt{player\_id}, \texttt{grid}) \rightarrow i$ which hashes the player's ID \& the grid identifier together to generate the index $i$ which is the location of the table pertaining to that player-grid pair on the disk.
|
||||||
|
This table will be used to record each instance of the player being in that grid throughout the match.
|
||||||
|
Because this overall approach results in the generation of a great number of tables ($\# \text{players} \times \# \text{grids}$), it is important to have a strong hash function that minimises the chances of a collision occurring.
|
||||||
|
|
||||||
|
\item For each tuple in the data file:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Identify the grid which contains the player's co-ordinates using the \mintinline{python}{determine_grid(x,y)} function, supplying the player's $x$ \& $y$ co-ordinates to the function as arguments.
|
||||||
|
\item Hash the player's ID \& the grid identifier together using $h(\texttt{player\_id}, \texttt{grid})$ to generate the index $i$ of the table pertaining to that player-grid pair on the disk.
|
||||||
|
\item Store the minimum data required from the original tuple in the table at index $i$, creating the table if it does not already exist.
|
||||||
|
Because we are iterating over data that is already sorted by timestamp, our insertions into the database will also be sorted, saving us from having to sort them ourselves.
|
||||||
|
We will calculate the amount of time spent in the grid by the number of entries in this table, so we assume that the data is always recorded at a regular interval, say 100ms.
|
||||||
|
The amount of time spent in the location will then be calculated by multiplying the count of entries in that table by the interval at which data was recorded, e.g. if the interval was 100ms and there were 6000 entries in the table, then we could say that the player spent $6000 \times 100 = 600000$ milliseconds in that location (ten minutes).
|
||||||
|
The obvious downside of this approach is that if data is not always recorded at a regular interval, it will not work.
|
||||||
|
In that case, we could additionally store the amount of time that elapsed before the next entry for that player was recorded in the data file as a column in the table.
|
||||||
|
This would of course require extra processing, as we would have to seek out the next instance of the player ID in the data file, get the timestamp of that entry, and then subtract the timestamp of the current entry from the timestamp of the next recorded timestamp for that player to calculate the interval, but it would allow a similar querying approach: to calculate the amount of time a player spent in the location, we could just sum the values in that column.
|
||||||
|
\\\\
|
||||||
|
The minimum data required will vary depending on the type of queries that we intend to execute; theoretically, if the only thing we wanted to know was the amount of time that a player spent in the grid and the data was always recorded at regular intervals, we could get away with not storing any data other than an iterative ID for each row that counts the number of times the player was recorded in that location.
|
||||||
|
We decided not to opt for this approach, instead opting to store everything from the original tuple except the player's ID (as that is implicit for the table), which allows us to execute the maximum number of different queries that we can, such as querying the distance from the player to the ball when the player is in that location for example.
|
||||||
|
Alternatively, we could also just store a pointer to the original tuple in the data file in the table, thus ensuring there is no duplication of data but we decided against this as it would require each tuple to be parsed for every query executed, rather than just parsing the tuples out into their respective columns once.
|
||||||
|
\end{enumerate}
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection{Algorithm}
|
||||||
|
Our proposed algorithm for calculating the amount of time that a player spent at a given location is largely outlined above, but we will lay it out here in a more concise manner.
|
||||||
|
Given the player's ID and either a pair of $(x,y)$ co-ordinates in the desired location, or the grid identifier of the given location:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Identify the grid location, if not given, using the \mintinline{python}{determine_grid(x, y)} function given above.
|
||||||
|
\item Get the index $i$ of the player-grid table using $h(\texttt{player\_id}, \texttt{grid})$.
|
||||||
|
\item Query the length of the table at index $i$.
|
||||||
|
\item Multiply the length of the table $L$ by the interval at which data was recorded $\Delta$ to get the time spent in the grid $t = L \times \Delta$.
|
||||||
|
E.g., if $\Delta = 100\text{ms}$ and $L = 6000$, then the amount of time spent in the grid is $t = 6000 \times 100 = 600000\text{ms}$, or ten minutes.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\section{Parallelisation}
|
||||||
|
One advantage of the chosen approach is that it relies on \& maintains the sorted order of the original data file when entering data into the tables.
|
||||||
|
Although the tables being in order is not strictly necessary for the primary type of query we are discussing here (querying the amount of time that a player spent at a location over the course of a game), it is highly advantageous to have the tables already in sorted order, as it saves us from having to sort them if we wanted to, for example, query the amount of time a player spent in a location in the second half of the match.
|
||||||
|
For this reason, we will want to ensure that the temporal order of the data is maintained when parallelising the approach.
|
||||||
|
\\\\
|
||||||
|
We could split the data file into $n$ equal chunks, where $n$ is the number of CPUs available, and then have each CPU apply the approach outlined above to process their chunks, but then the data would not be inserted into each table in sorted order; we would then have to sort each table in some manner (such as a two-pointer approach) to end up with the sorted tables.
|
||||||
|
Instead, to keep in line with our existing indexing approach, and maintain the sorted order of the data file from the beginning, we opted for the following approach:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Iterate over the data file and pull out the set of player IDs contained within it, e.g. for a match of Gaelic Football with no substitutions, we would have $30$ player IDs.
|
||||||
|
\item Assign each CPU available to us a set of player IDs that it will be responsible for, ensuring no overlap.
|
||||||
|
The split of the player IDs among the CPUs should be as equal as possible, assuming equally powerful CPUs.
|
||||||
|
\item For each player ID in its set, each CPU will iterate through the data file in order, searching for the tuples which contain the player ID currently under consideration.
|
||||||
|
Each tuple will be handled with the approach outlined previously (identify grid, get index of player-grid table, insert data).
|
||||||
|
This maintains the primary advantage of the original indexing approach: we take advantage of the data already being sorted and avoid having to re-sort it.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
|
||||||
|
% ## Parallelize query ?
|
||||||
|
% - Split the return table from hash(p_id, grid cell) into chunks and assign to different CPUs using round robin (as just want an equal split of work between CPUs)
|
||||||
|
|
||||||
|
\section{\boldmath{$3 \times 3$} Grid Heatmap}\label{sec:heatmap}
|
||||||
|
The indexing approach outlined in the section \textbf{\hyperref[sec:indexingapproach]{Indexing Approach}} would work for the coaches' queries about the times in which a given player was in a specified rectangle of the $3 \times 3$ grid.
|
||||||
|
The indexing approach would be completely unchanged, except the grid definitions would be different, with both different grid dimensions and a different number of tables.
|
||||||
|
For each tuple in the data file, the grid would be identified from the $(x,y)$ co-ordinates using the \mintinline{python}{determine_grid()} function, the player ID \& the grid identifier would be hashed together to find the location of the relevant table on disk, and the data would be inserted.
|
||||||
|
The number of times that a player was in a given grid could be determined by simply hashing the player ID \& the grid identifier together to get the location of the relevant table on disk, and querying the length of that table to find the number of entries.
|
||||||
|
\\\\
|
||||||
|
The following pseudo-code could be used to calculate the values for a heatmap for each player across the entire pitch:
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{Python}
|
||||||
|
# function to determine the number of times a given player was in a given grid
|
||||||
|
# input: player ID, grid identifier
|
||||||
|
# output: count of the number of times that player ID was recorded in that grid
|
||||||
|
def player_grid_values(player_id, grid_id):
|
||||||
|
# get table location on disk
|
||||||
|
table_location = hash(player_id, grid_id)
|
||||||
|
|
||||||
|
# query the number of rows in that table using some pseudo-SQL nonsense
|
||||||
|
count = select("count(*) from table @ " + table_location)
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
|
# function to generate an array of heatmap values for every location on the pitch
|
||||||
|
# input: an array of locations on the pitch, an array of the player IDs in the game
|
||||||
|
# output: an array of numbers indicating the number of times a player was in each location
|
||||||
|
def generate_heatmap(locations, players):
|
||||||
|
# output array will hold the number of times a player was in each location, in the same order as the locations array
|
||||||
|
heatmap_values = [] # assume each index initialises to 0
|
||||||
|
|
||||||
|
# iterate over each location
|
||||||
|
for (int i = 0; i < locations.length; i++):
|
||||||
|
# iterate over each player for each location
|
||||||
|
for (int j = 0; j < players.length; j++):
|
||||||
|
player_id = players[j]
|
||||||
|
heatmap_values[i] += player_grid_values(player_id, locations[i])
|
||||||
|
\end{minted}
|
||||||
|
\caption{Pseudo-Code to Calculate the Heatmap Values}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\section{Identify When Two Players Are in the Same Location}
|
||||||
|
To find when two players from either team are in the same location, we can use the exact same indexing approach as in \textbf{\hyperref[sec:heatmap]{\boldmath{$3 \times 3$} Grid Heatmap}}.
|
||||||
|
The following pseudo-code could be used to query the times when two given players are in the same grid location:
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[linenos, breaklines, frame=single]{Python}
|
||||||
|
# function to return the times when two given players are in the same location
|
||||||
|
# input: player IDs of both players, and the grid identifier
|
||||||
|
# output: array of timestamps
|
||||||
|
def same_times(player1, player2, grid):
|
||||||
|
# get the times both players are in the given grid
|
||||||
|
player1_times = query("select times from table @ " + hash(player1, grid))
|
||||||
|
player2_times = query("select times from table @ " + hash(player2, grid))
|
||||||
|
|
||||||
|
same_times = []
|
||||||
|
|
||||||
|
# index i for player1_times and index j for player2_times
|
||||||
|
i = j = 0
|
||||||
|
|
||||||
|
while i < len(player1_times) and j < len(player2_times):
|
||||||
|
if player1_times[i] == player2_times[j]:
|
||||||
|
# both players are in the grid cell at this time
|
||||||
|
same_times.append(player1_times[i])
|
||||||
|
i += 1
|
||||||
|
j += 1
|
||||||
|
else if player2_times[j] > player1_times[i]:
|
||||||
|
# increment the player1 index (i) as it's smaller than the time at j
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
# increment the player2 index (j) as it's smaller than the time at i
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
return same_times
|
||||||
|
\end{minted}
|
||||||
|
\caption{Pseudo-Code to Query the Times When Two Players Are in the Same Location}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\end{document}
|
Binary file not shown.
After Width: | Height: | Size: 92 KiB |
Binary file not shown.
Binary file not shown.
@ -0,0 +1,640 @@
|
|||||||
|
%! TeX program = lualatex
|
||||||
|
\documentclass[a4paper,11pt]{article}
|
||||||
|
% packages
|
||||||
|
\usepackage{censor}
|
||||||
|
\StopCensoring
|
||||||
|
\usepackage{fontspec}
|
||||||
|
\setmainfont{EB Garamond}
|
||||||
|
% for tironian et fallback
|
||||||
|
% % \directlua{luaotfload.add_fallback
|
||||||
|
% % ("emojifallback",
|
||||||
|
% % {"Noto Serif:mode=harf"}
|
||||||
|
% % )}
|
||||||
|
% % \setmainfont{EB Garamond}[RawFeature={fallback=emojifallback}]
|
||||||
|
|
||||||
|
\setmonofont[Scale=MatchLowercase]{Deja Vu Sans Mono}
|
||||||
|
\usepackage[a4paper,left=2cm,right=2cm,top=\dimexpr15mm+1.5\baselineskip,bottom=2cm]{geometry}
|
||||||
|
\setlength{\parindent}{0pt}
|
||||||
|
|
||||||
|
\usepackage{fancyhdr} % Headers and footers
|
||||||
|
\fancyhead[R]{\normalfont \leftmark}
|
||||||
|
\fancyhead[L]{}
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\usepackage{microtype} % Slightly tweak font spacing for aesthetics
|
||||||
|
\usepackage[english]{babel} % Language hyphenation and typographical rules
|
||||||
|
\usepackage[final, colorlinks = false, urlcolor = cyan]{hyperref}
|
||||||
|
\usepackage{changepage} % adjust margins on the fly
|
||||||
|
|
||||||
|
\usepackage{minted}
|
||||||
|
\usemintedstyle{algol_nu}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
|
||||||
|
\usepackage{pgfplots}
|
||||||
|
\pgfplotsset{width=\textwidth,compat=1.9}
|
||||||
|
|
||||||
|
\usepackage{caption}
|
||||||
|
\newenvironment{code}{\captionsetup{type=listing}}{}
|
||||||
|
\captionsetup[listing]{skip=0pt}
|
||||||
|
\setlength{\abovecaptionskip}{5pt}
|
||||||
|
\setlength{\belowcaptionskip}{5pt}
|
||||||
|
|
||||||
|
\usepackage[yyyymmdd]{datetime}
|
||||||
|
\renewcommand{\dateseparator}{--}
|
||||||
|
|
||||||
|
\usepackage{enumitem}
|
||||||
|
|
||||||
|
\usepackage{titlesec}
|
||||||
|
|
||||||
|
% \author{Andreas Ó hAodha}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\begin{titlepage}
|
||||||
|
\begin{center}
|
||||||
|
\hrule
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\censor{\huge \textbf{CT3532}}
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\hrule
|
||||||
|
\LARGE
|
||||||
|
\vspace{0.5cm}
|
||||||
|
Assignment 3: Graphs
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
|
||||||
|
\footnotesize
|
||||||
|
\vfill
|
||||||
|
% \begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\begin{tikzpicture}
|
||||||
|
% Define the nodes
|
||||||
|
\foreach \x in {1,...,11}
|
||||||
|
\node[circle,draw] (node\x) at ({360/11 * (\x - 1)}:8) {Player $\x$};
|
||||||
|
|
||||||
|
% Define the edges and their distances
|
||||||
|
\foreach \i in {1,...,11} {
|
||||||
|
\foreach \j in {\i,...,11} {
|
||||||
|
\ifnum\i=\j\relax
|
||||||
|
\else
|
||||||
|
% \pgfmathsetmacro\distance{rand} % Generate random distances
|
||||||
|
% \draw (node\i) -- (node\j) node[midway, sloped, above] {\distance};
|
||||||
|
\draw (node\i) -- (node\j) ;
|
||||||
|
\fi
|
||||||
|
}
|
||||||
|
}
|
||||||
|
\end{tikzpicture}
|
||||||
|
% \caption{An Example Graph with 11 Nodes Representing a Team}
|
||||||
|
% \end{figure}
|
||||||
|
\vfill
|
||||||
|
|
||||||
|
% \vspace{1.6cm}
|
||||||
|
\hrule
|
||||||
|
\begin{minipage}{0.495\textwidth}
|
||||||
|
\vspace{0.4em}
|
||||||
|
\raggedright
|
||||||
|
\normalsize
|
||||||
|
Name: \censor{Andrew Hayes} \\
|
||||||
|
E-mail: \censor{\href{mailto://a.hayes18@universityofgalway.ie}{\texttt{a.hayes18@universityofgalway.ie}}} \hfill\\
|
||||||
|
ID: \censor{21321503} \hfill
|
||||||
|
\end{minipage}
|
||||||
|
\begin{minipage}{0.495\textwidth}
|
||||||
|
\raggedleft
|
||||||
|
\vspace*{0.8cm}
|
||||||
|
\Large
|
||||||
|
\today
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\end{minipage}
|
||||||
|
\medskip\hrule
|
||||||
|
\end{center}
|
||||||
|
\end{titlepage}
|
||||||
|
|
||||||
|
\pagenumbering{roman}
|
||||||
|
\newpage
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
\setcounter{page}{1}
|
||||||
|
\pagenumbering{arabic}
|
||||||
|
|
||||||
|
\section{Representing the Graph in a Relational Database}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=0.8\textwidth]{./images/schema.png}
|
||||||
|
\caption{Database Schema Diagram}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
|
||||||
|
The key challenge with representing a complete graph in a database is the number of edges on each node.
|
||||||
|
If we take an 11-player soccer team for example, each node will have 10 edges (one joining it to each other node), resulting in a graph like the example
|
||||||
|
graph shown on the cover page.
|
||||||
|
We also need to take into consideration that each edge has distance information associated with it, but no direction
|
||||||
|
information.
|
||||||
|
This means that storing the edges within the node somehow is not a good idea:
|
||||||
|
since there is no direction information, we would have to decide arbitrarily which node to store an edge with, or duplicate it across both nodes.
|
||||||
|
Furthermore, it is not easy to say with certainty the number of edges that we will wish to store for each node at any given time, making it difficult to represent them as, say, 10 distinct columns in a
|
||||||
|
table.
|
||||||
|
For example, if a player gets sent off and there are no other players available to be substituted in their place, suddenly each node would have only 9 edges, or if we decided that we weren't interested in
|
||||||
|
storing an edge if the distance between the players was too great to be meaningful, the number of edges stored per node would be variable.
|
||||||
|
I make the assumption that the graphs in question are not multigraphs, i.e. there can be no edge joining a node to itself (I can't think of what that would even
|
||||||
|
mean in this context, as the distance from a player to themselves would naturally always be 0).
|
||||||
|
\\\\
|
||||||
|
My proposed approach for representing the graph in a relational database is to have separate node \& edge tables.
|
||||||
|
Because each node represents a player and each edge represents the distance between two players, the tables will be named
|
||||||
|
\mintinline{sql}{players} \& \mintinline{sql}{distances} respectively to relate them easily to the physical reality that they represent.
|
||||||
|
We must also consider that the graph may be generated several times throughout a game, so we should be able to distinguish which entries in a given table pertain to which graphs.
|
||||||
|
For this reason, I will also make use of a third table called \mintinline{sql}{graphs}, which every entry in the \mintinline{sql}{distances} table will have a foreign key to, allowing us to identify the
|
||||||
|
graph to which that entry pertains.
|
||||||
|
|
||||||
|
\subsection{The \texttt{graphs} Table}
|
||||||
|
The \mintinline{sql}{graphs} table will be used to identify each graph.
|
||||||
|
The table will have three columns:
|
||||||
|
\begin{itemize}
|
||||||
|
\item An auto-incrementing integer column called \mintinline{sql}{graph_id} which will serve as the table's primary key.
|
||||||
|
This key will be referenced by each row in the \mintinline{sql}{distances} table so that the graph to which the data in the row relates can be identified.
|
||||||
|
An auto-incrementing ID was chosen as opposed to a composite key of the game's ID \& the time of the graph's generation so as to reduce the duplication of data across tables.
|
||||||
|
|
||||||
|
\item An integer column called \mintinline{sql}{game_id} which uniquely identifies the game to which the graph pertains.
|
||||||
|
We assume that this is defined elsewhere, either in a \mintinline{sql}{games} table or simply on an iterative basis, where the $n$\textsuperscript{th} game played gets a
|
||||||
|
\mintinline{sql}{game_id} of $n$.
|
||||||
|
The SQL code examples will make the assumption that a \mintinline{sql}{games} table exists, but since this table is irrelevant to the representation of the graph and could be easily done
|
||||||
|
without, I won't bother defining what columns it should contain or how to create it; the only column that it must contain is a \mintinline{sql}{game_id}.
|
||||||
|
|
||||||
|
\item A \mintinline{sql}{TIMESTAMP} column called \mintinline{sql}{time_generated} which represents the time at which the graph was generated.
|
||||||
|
I opted to use a standard \mintinline{sql}{TIMESTAMP} instead of a variable representing the amount of game time that has elapsed for the sake of simplicity \& flexibility.
|
||||||
|
This column would constitute a candidate key if there was a guarantee that no two matches would be played simultaneously, and therefore we wouldn't need the \mintinline{sql}{graph_id} or
|
||||||
|
\mintinline{sql}{game_id}.
|
||||||
|
However, I feel that this would be an unreasonable assumption to make and would result in a less robust schema so I opted for the two extra columns instead.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{SQL Code to Create the \texttt{graphs} Table}
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[linenos, breaklines, frame=single]{sql}
|
||||||
|
CREATE TABLE graphs (
|
||||||
|
graph_id INT NOT NULL AUTO_INCREMENT,
|
||||||
|
game_id INT NOT NULL,
|
||||||
|
time_generated TIMESTAMP NOT NULL,
|
||||||
|
|
||||||
|
PRIMARY KEY (graph_id),
|
||||||
|
    FOREIGN KEY (game_id) REFERENCES games(game_id) -- assuming that a `games` table exists elsewhere
|
||||||
|
);
|
||||||
|
\end{minted}
|
||||||
|
\caption{SQL Code to Create the \texttt{graphs} Table}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\subsection{The \texttt{players} Table}
|
||||||
|
The details of the player table depend on what kind of data we want to store per node.
|
||||||
|
I am operating under the assumption that we do not intend on storing the $(x,y)$ co-ordinates of the player for each node on the graph.
|
||||||
|
This allows us to use some arbitrary \mintinline{sql}{players} table which contains whatever information we want to store on each player such as name, age, team, etc.
|
||||||
|
The specifics of what is stored in the \mintinline{sql}{players} table are largely irrelevant to the implementation of my schema; the only requirement is that there is a unique integer player ID for
|
||||||
|
each player that is unique across teams and games.
|
||||||
|
For this reason, I will operate with the most bare-bones possible player table which will contain an auto-incrementing integer player ID and nothing else, but in a practical scenario this table ought to
|
||||||
|
include columns such as the name of the team to which the player belongs, the player's name, the player's squad number, etc.
|
||||||
|
\\\\
|
||||||
|
If, however, we wanted to store the $(x,y)$ co-ordinates of each player with each node, it would be best to not use the \mintinline{sql}{players} table to represent the nodes; instead we should define a
|
||||||
|
\mintinline{sql}{nodes} table which has a foreign key to the \mintinline{sql}{players} table, a foreign key to the \mintinline{sql}{graphs} table, \& the $(x,y)$ co-ordinates of the player at the time
|
||||||
|
when the graph was generated.
|
||||||
|
|
||||||
|
\subsubsection{SQL Code to Create the \texttt{players} Table}
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[linenos, breaklines, frame=single]{sql}
|
||||||
|
CREATE TABLE players (
|
||||||
|
player_id INT NOT NULL AUTO_INCREMENT,
|
||||||
|
-- whatever other relevant information for each player should be included here
|
||||||
|
|
||||||
|
PRIMARY KEY (player_id)
|
||||||
|
);
|
||||||
|
\end{minted}
|
||||||
|
\caption{SQL Code to Create the \texttt{players} Table}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\subsection{The \texttt{distances} Table}
|
||||||
|
The purpose of the \mintinline{sql}{distances} table is to represent the edges between each pair of nodes.
|
||||||
|
It must store the IDs of two nodes (or players) and the distance between them.
|
||||||
|
However, since each edge is undirected, there is no easy way to say which node should be stored in which column of the table.
|
||||||
|
To ensure that the ordering of the node pairs remains consistent, I will store the node with the lower player ID in the first column and the node with the higher player ID in the second column.
|
||||||
|
The table will consist of four columns:
|
||||||
|
\begin{itemize}
|
||||||
|
\item The \mintinline{sql}{graph_id} of the graph to which this edge belongs, which will reference the \mintinline{sql}{graphs} table.
|
||||||
|
|
||||||
|
\item A column named \mintinline{sql}{player1} which will hold the player ID of the first player in the pair of nodes that the edge joins.
|
||||||
|
The player ID stored in this column will be the lesser of the two player IDs in question, so that we can ensure that the ordering remains consistent.
|
||||||
|
This column will be a foreign key that references the \mintinline{sql}{players} table.
|
||||||
|
|
||||||
|
\item A column named \mintinline{sql}{player2} which will hold the player ID of the second player in the pair of nodes that the edge joins.
|
||||||
|
The player ID stored in this column will be the greater of the two player IDs in question, so that we can ensure that the ordering remains consistent.
|
||||||
|
This column will also be a foreign key that references the \mintinline{sql}{players} table.
|
||||||
|
|
||||||
|
\item A column named \mintinline{sql}{distance} which will store the distance between the two players at the time the graph was generated.
|
||||||
|
Assuming that this is measured in metres or some similar unit, this column will need to be a floating point number.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Because I don't expect to have any foreign keys referencing this table, it is likely more efficient in terms of space to use a composite key comprised of the columns \mintinline{sql}{graph_id},
|
||||||
|
\mintinline{sql}{player1}, \& \mintinline{sql}{player2} instead of having an auto-incrementing \mintinline{sql}{distance_id} column.
|
||||||
|
|
||||||
|
\subsubsection{SQL Code to Create the \texttt{distances} Table}
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[linenos, breaklines, frame=single]{sql}
|
||||||
|
CREATE TABLE distances (
|
||||||
|
graph_id INT NOT NULL,
|
||||||
|
player1 INT NOT NULL,
|
||||||
|
player2 INT NOT NULL,
|
||||||
|
distance FLOAT,
|
||||||
|
|
||||||
|
PRIMARY KEY (graph_id, player1, player2),
|
||||||
|
FOREIGN KEY (graph_id) REFERENCES graphs(graph_id),
|
||||||
|
FOREIGN KEY (player1) REFERENCES players(player_id),
|
||||||
|
    FOREIGN KEY (player2) REFERENCES players(player_id)
|
||||||
|
);
|
||||||
|
\end{minted}
|
||||||
|
\caption{SQL Code to Create the \texttt{distances} Table}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\section{Representing the Data in a Data Structure} \label{sec:data_structure}
|
||||||
|
One of the most obvious ways to represent this data in a data structure (to me, at least) is to take an object-oriented programming approach.
|
||||||
|
My proposed data structure would be a hierarchy in which objects would be encapsulated within other objects.
|
||||||
|
There would be a class of \mintinline{java}{Game} objects which would contain all the data related to a game, including a set of \mintinline{java}{Graph} objects.
|
||||||
|
These \mintinline{java}{Graph} objects would contain all the data related to a graph generated at a moment in time during the game, including a timestamp of when the graph was generated and a set of
|
||||||
|
\mintinline{java}{Edge} objects.
|
||||||
|
Each of these \mintinline{java}{Edge} objects would contain two \mintinline{java}{Player} objects and the distance between them.
|
||||||
|
Finally, the \mintinline{java}{Player} objects would each contain data about the player which they represent, such as name, team name, etc.
|
||||||
|
The \mintinline{java}{Player} objects would be contained within the \mintinline{java}{Edge} objects using an unordered set, as the edges are undirected.
|
||||||
|
The choice to store \mintinline{java}{Player} (node) objects inside \mintinline{java}{Edge} objects instead of vice-versa is because the number of nodes per edge
|
||||||
|
is known to always be 2, whereas a node could have as many edges as there are other nodes in the graph.
|
||||||
|
\\\\
|
||||||
|
An obvious question that arises from this proposed data structure is one of data duplication: if each edge contains two players, would that not mean that each player object is duplicated for every edge object
|
||||||
|
between it and another player?
|
||||||
|
We can avoid duplicating data by representing this data structure in a language such as Java, as Java allows us to make reference to objects within several different objects, using what is essentially a
|
||||||
|
data pointer.
|
||||||
|
Therefore, two (or 10) \mintinline{java}{Edge} objects could make reference to the same \mintinline{java}{Player} object without duplicating the data contained within that \mintinline{java}{Player} object.
|
||||||
|
|
||||||
|
\subsection{Code to Represent the Data in a Data Structure}
|
||||||
|
The following (highly simplified) Java code could be used to represent the proposed classes:
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{Java}
|
||||||
|
// Note that in Java, there can be only one `public` class per file
|
||||||
|
// Therefore, if this code were to be actually used, each class must be in its own `*.java` file
|
||||||
|
|
||||||
|
public class Game {
|
||||||
|
// potential data fields that could be contained within the Game class
|
||||||
|
public int gameId;
|
||||||
|
public String homeTeam;
|
||||||
|
public String awayTeam;
|
||||||
|
|
||||||
|
// HashMap of Graph objects encapsulated within the Game object
|
||||||
|
public HashMap<LocalDateTime, Graph> graphs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Graph {
|
||||||
|
public LocalDateTime timeGenerated; // need to import java.time.LocalDateTime for this to work
|
||||||
|
|
||||||
|
// set of Edge objects encapsulated within the Graph object
|
||||||
|
public Set<Edge> edges;
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Edge {
|
||||||
|
public float distance;
|
||||||
|
|
||||||
|
// set of Player objects encapsulated within the Edge object - there should be no more than 2
|
||||||
|
public Set<Player> nodes;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public class Player {
|
||||||
|
// potential data fields that would be contained within the Player class
|
||||||
|
public String name;
|
||||||
|
public int playerId; // assuming each player in the league has a unique ID
|
||||||
|
}
|
||||||
|
\end{minted}
|
||||||
|
\caption{Sample Java Code to Represent the Proposed Data Structure}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\subsection{Alternative Data Structures}
|
||||||
|
Another good (perhaps more conventional) choice of data structure to represent a graph would be to use an Adjacency Matrix.
|
||||||
|
This has the benefit of being very simple: it is simply a 2D array with a row \& column for each node.
|
||||||
|
Nodes that share an edge should have the length of this edge recorded in the cells at the intersections between their rows \& columns.
|
||||||
|
One drawback of the adjacency matrix is the data duplication: we store each value twice.
|
||||||
|
It also doesn't make it very easy to store information on the nodes in an easily retrievable manner.
|
||||||
|
It is also not very easy to resize dynamically.
|
||||||
|
However, it is simple, and highly programming-paradigm agnostic, so it too would be a good choice.
|
||||||
|
|
||||||
|
\section{Algorithm to Measure the Similarity of Two Graphs}
|
||||||
|
I think that the most appropriate algorithm for measuring the similarity of two graphs representing this kind of data is
|
||||||
|
Graph Edit Distance.
|
||||||
|
Graph Edit Distance measures the similarity of two graphs by counting the number of primitive graph operations that would
|
||||||
|
be required to transform one of the graphs into the other.
|
||||||
|
I feel that this approach is particularly appropriate in the context of sports teams, as we can compare the number of
|
||||||
|
graph operations required to turn one graph into another to the number of player movements required to turn one formation
|
||||||
|
into another.
|
||||||
|
\\\\
|
||||||
|
Getting the GED of two graphs is greatly simplified if the nodes can be set up in a one-to-one correspondence.
|
||||||
|
If such a relationship between nodes could be established for our data, then the task of computing the number of operations
|
||||||
|
required to transform one graph into the other would be much easier.
|
||||||
|
Getting the identity of player nodes is more or less difficult depending on the sport in question:
|
||||||
|
\begin{itemize}
|
||||||
|
\item For most team sports, players have set positions on a team.
|
||||||
|
This makes it easy to establish equivalences between nodes: if we were comparing two rugby team graphs, we
|
||||||
|
could compare the scrum-half node from one graph to the scrum-half node of the other.
|
||||||
|
I am making the simplifying assumption that when we compare two graphs, the teams represented in those two
|
||||||
|
graphs will be teams playing the same sport.
|
||||||
|
|
||||||
|
\item For some team sports however, such as soccer, there are no set positions.
|
||||||
|
There are common positions and formations, but these are not set in stone.
|
||||||
|
This makes it much harder to establish a correspondence between equivalent nodes in different graphs.
|
||||||
|
My proposed solution for establishing node identity for this type of sport is to identify players by their
|
||||||
|
distance from some mostly stationary player.
|
||||||
|
In the case of soccer, players would be identified by their distance from the goalkeeper of their team;
|
||||||
|
in the majority of cases, the closest players will be the backs, the furthest will be the forwards, and the
|
||||||
|
middle players would be the midfielders.
|
||||||
|
This is a less precise correspondence than the one for sports with set positions, as the left back and the
|
||||||
|
right back for example might be of essentially the same distance from the goalkeeper causing the identities to
|
||||||
|
be confused, but without having set positions, it is difficult to compare the graphs of two potentially entirely
|
||||||
|
unrelated teams.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Before calculating the Graph Edit Distance, we must also define what we consider to be primitive graph operations in this
|
||||||
|
context.
|
||||||
|
Since we are dealing with a very specific type of data with weighted edges, I will be using some specific graph operations.
|
||||||
|
I will not be using ``substitution'' operations, as this is not something that I think is important to this analysis, i.e.
|
||||||
|
if we are comparing two team graphs, and the formation of the two is identical, but one has John Smith playing as goalkeeper
|
||||||
|
and the other has Joe Murphy, we don't care about the specific identity of the nodes.
|
||||||
|
Since the formation is the same, we should consider the graphs to be identical without having to substitute the John Smith
|
||||||
|
node for the Joe Murphy node.
|
||||||
|
The primitive graph operations that I will be considering are the following:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Node insertion \& deletion: if a player is missing from one graph, to transform one graph into the other we
|
||||||
|
will need to insert or delete a node.
|
||||||
|
|
||||||
|
\item Distance lengthening \& shortening: if the weight of an edge between two nodes is greater in one graph, to
|
||||||
|
transform one graph into the other we will need to lengthen or shorten the distance between the nodes.
|
||||||
|
I am assuming here that we are dealing only with complete graphs, i.e. each node is adjacent to every other node.
|
||||||
|
For a simple measurement of how different the two graphs are, I am going to treat the distances between nodes
|
||||||
|
as if one can be changed without changing all the others.
|
||||||
|
Of course, on a real pitch, if the distance between two players was to get shorter, it would mean that one or both
|
||||||
|
of them had moved, in turn changing the distances between them and the other players.
|
||||||
|
I am going to count changing the length of an edge by 1 unit to be a simple operation, so if an edge were
|
||||||
|
to be transformed to be 5 units shorter, that would be 5 operations, and if it were to be made 10 units longer,
|
||||||
|
that would be 10 operations.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Essentially, my approach can be boiled down to establishing a one-to-one correspondence between the nodes, and then comparing
|
||||||
|
the different edge lengths that those equivalent nodes have, and summing up the absolute value of the differences to get a
|
||||||
|
dissimilarity score, with the higher the score, the more dissimilar the graphs.
|
||||||
|
The algorithm would be as follows:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Establish a one-to-one correspondence between the nodes in the graph, by position if applicable for the sport,
|
||||||
|
or by distance from some root player (such as the goalkeeper) otherwise.
|
||||||
|
If one graph has more nodes than the other, note this.
|
||||||
|
|
||||||
|
\item Consider how many nodes would need to be added/deleted to/from Graph 1 to transform it into Graph 2.
|
||||||
|
Take the insertion/deletion of a node to be one operation, ignoring the length of those nodes' edges.
|
||||||
|
Add the count of these operations to the overall dissimilarity score of the graphs.
|
||||||
|
|
||||||
|
\item For each pair of equivalent nodes in the two graphs, identify the equivalent edges by the ones that link to
|
||||||
|
equivalent nodes.
|
||||||
|
For each of these equivalent edges, get the absolute value of the difference between the two edge lengths.
|
||||||
|
Add this absolute value to the overall dissimilarity score of the graph.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
This gives a simple yet meaningful measure of the differences between the two graphs.
|
||||||
|
One way that this could be improved is to use some algorithm to calculate how to re-arrange the graph such that each
|
||||||
|
edge was the correct length to reflect the physical reality of potential distances between players.
|
||||||
|
|
||||||
|
\subsection{Alternative Algorithms}
|
||||||
|
There are, of course, other algorithms that could be employed to calculate the similarity of two graphs.
|
||||||
|
One such algorithm that could be applicable in this context is the Global Clustering Coefficient of the graph, which would tell us how likely nodes within that
|
||||||
|
graph are to form clusters, i.e. how likely it is that a node's neighbours are also connected to each other.
|
||||||
|
This would give us an idea of how spread out the team is on the pitch, which would likely be useful for certain types of analysis.
|
||||||
|
\\\\
|
||||||
|
My initial idea for comparing the similarity of two graphs was to represent them as vectors and use Cosine Similarity.
|
||||||
|
This was particularly appealing to me because it's a consistent and meaningful measure of the similarity of two vectors,
|
||||||
|
giving a score between $-1$ and $1$, and doesn't require re-inventing the wheel or any complex computations.
|
||||||
|
Furthermore, Cosine Similarity doesn't consider the magnitude of a vector, so we could consider the shape of the graph and
|
||||||
|
ignore the exact distance between nodes if our vectorisation approach was appropriate.
|
||||||
|
Even more appealing was the fact that Cosine Similarity can compare vectors with different numbers of elements, meaning
|
||||||
|
that we could compare a team with 11 players to one that was missing a player and only had 10.
|
||||||
|
Everything about the Cosine Similarity approach was extremely appealing to me, and I spent a lot of time trying to get it
|
||||||
|
to work, but I failed to define an algorithm that could translate a graph into a vector in a meaningful manner, such that
|
||||||
|
the direction of the vector in $N$ dimensions (with $N$ being the number of nodes in the graph) said something meaningful
|
||||||
|
about the overall shape of the graph.
|
||||||
|
I still feel that there is a lot of potential in the Cosine Similarity approach, if only I had a meaningful way of
|
||||||
|
representing a graph as a vector.
|
||||||
|
My attempts usually resulted in vectors that pointed in roughly the same direction regardless of the shape of the graph, as
|
||||||
|
I calculated the vectors by having one element per node, and each element to be the ``value'' of the node, defined by
|
||||||
|
$\mathrm{value}(v) = \alpha \cdot \mathrm{Degree}(v) - \sum^n_{i = 1}{d_i}$ where $v$ is a vertex in the graph,
|
||||||
|
$\alpha$ is some weight to emphasise the importance of the degree of the vertex, from which we subtract the sum of the
|
||||||
|
length of each \textbf{d}istance value from 1 to $n$ for an $n$-node graph.
|
||||||
|
The issue with this approach is that it generally generated quite similar vectors regardless of the actual similarity
|
||||||
|
of the graphs, much to my disappointment.
|
||||||
|
|
||||||
|
|
||||||
|
\section{Degree \& In-Betweenness}
|
||||||
|
\subsection{Calculate the Degree of Each Node}
|
||||||
|
Given a snapshot graph wherein edges shorter than some length $k$ are discarded, and the data structure outlined
|
||||||
|
in \textbf{\hyperref[sec:data_structure]{Representing the Data in a Data Structure}}, we can calculate the degree
|
||||||
|
of each node by iterating over the set of edges and incrementing a variable for each player when it is contained
|
||||||
|
by an \mintinline{java}{Edge} object.
|
||||||
|
This can be achieved with the following Java code:
|
||||||
|
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{Java}
|
||||||
|
// Depending on how often we intend to do this calculation and how we intend to do it, it would likely be better to put this method in the `Graph` class.
|
||||||
|
// However, to keep in line with the simplicity of the classes as defined previously, I have opted to pass the Graph to the method as an argument rather than change the Graph class.
|
||||||
|
|
||||||
|
// Input: A snapshot Graph object.
|
||||||
|
// Output: A HashMap data structure in which the key is a Player (node) object and the value accessible by that key is the degree of that Player (node) object.
|
||||||
|
public HashMap<Player, Integer> calculateDegrees(Graph graph) {
|
||||||
|
HashMap<Player, Integer> returnValues = new HashMap<>();
|
||||||
|
|
||||||
|
// looping over each Edge in the Graph
|
||||||
|
for (Edge edge: graph.edges) {
|
||||||
|
// loop over each Player in the Edge's `nodes` Set
|
||||||
|
// there should really be no more than two, practically speaking, and no less than 2 assuming that we are not dealing with multigraphs but the number of nodes an edge joins doesn't matter from an algorithmic perspective
|
||||||
|
for (Player player : edge.nodes) {
|
||||||
|
// get the current degree count for the player if it is already defined
|
||||||
|
Integer degree = returnValues.get(player);
|
||||||
|
|
||||||
|
// if the degree is not yet defined, set it to 1
|
||||||
|
if (degree == null) {
|
||||||
|
degree = 1;
|
||||||
|
}
|
||||||
|
// otherwise increment it by 1
|
||||||
|
else {
|
||||||
|
degree++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the player's degree in returnValues to the updated value
|
||||||
|
returnValues.put(player, degree);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return returnValues;
|
||||||
|
}
|
||||||
|
\end{minted}
|
||||||
|
\caption{Java Code to Calculate the Degree of Each Node in a Graph Snapshot}
|
||||||
|
\end{code}
|
||||||
|
|
||||||
|
\subsection{Determine Which Node(s) is/are On the Most Paths}
|
||||||
|
Before we can determine which node is on the most paths, we must first define what we mean by that.
|
||||||
|
I will take a path in this context to be a finite sequence of nodes, with no repetition of nodes.
|
||||||
|
Here, I am making the assumption that each pair of nodes can only have one edge joining them, which is true for this kind of data.
|
||||||
|
I am also making the simplifying assumption that each path is a ``simple path'' and that a node cannot appear in a path twice, i.e. I am excluding cyclic paths.
|
||||||
|
I am also not considering single nodes to be a path, i.e. a path must contain more than one node to be considered a path.
|
||||||
|
We must note that since we are dealing with undirected graphs, the paths too are undirected, so a path $A \rightarrow B \rightarrow C$ is the same as the path
|
||||||
|
$C \rightarrow B \rightarrow A$.
|
||||||
|
\\\\
|
||||||
|
Firstly, we must find each extant simple path in the graph.
|
||||||
|
Since we are dealing only with simple paths, we know that no node can be repeated in a path, i.e. each path contains at most $N$ nodes, where $N$ is the
|
||||||
|
number of nodes in the graph.
|
||||||
|
We will represent a path as a Java \mintinline{java}{ArrayList} of \mintinline{java}{Player} objects.
|
||||||
|
Each path of length $i$ will be stored with the other paths of that length in an \mintinline{Java}{Set} of \mintinline{Java}{ArrayList}s, which is a data structure that allows no
|
||||||
|
duplicates and ignores any attempt to insert a duplicate object.
|
||||||
|
We are going to generate each path of length $i$ by appending nodes (where there is a joining edge) to paths of length $i-1$ that we have already generated.
|
||||||
|
Then we will have a \mintinline{java}{Set} of each \emph{directed} path in the graph, as we have not considered when generating the paths that a path $A \rightarrow B \rightarrow C \cong
|
||||||
|
C \rightarrow B \rightarrow A$.
|
||||||
|
Each \mintinline{java}{Set} containing each path of length $i$ actually contains double the amount of paths that it should, as it has two of each path: one forwards, one reversed.
|
||||||
|
There are two options for dealing with this:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Remove the duplicate paths somehow, most likely by looping over each one backwards.
|
||||||
|
While probably more technically correct \& robust, this seems like a lot of work.
|
||||||
|
|
||||||
|
\item Ignore the problem.
|
||||||
|
Since we know that there are two of each path in our sets, we know that when we count how many times a given \mintinline{java}{Player} object occurs in a path, we will get a
|
||||||
|
number twice the size that it should be, and we can just half it to get the correct value.
|
||||||
|
If we wanted to be even more lazy, we could get away with not even halving the count to get the correct value, as we just want to find the largest count relative to all the others,
|
||||||
|
and all the counts being off by a factor of $2$ doesn't affect our ability to do this.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
I have opted to ignore the problem.
|
||||||
|
My proposed algorithm to determine which node is on the greatest number of paths using the data structure proposed in \textbf{\nameref{sec:data_structure}} is as
|
||||||
|
follows:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Declare a \mintinline{java}{HashMap<Player, Integer>} where the key is the \mintinline{java}{Player} and the value stored is the number of times that \mintinline{java}{Player} occurs on
|
||||||
|
a path in the graph.
|
||||||
|
|
||||||
|
\item Then, we must generate the ``paths'' of length $1$.
|
||||||
|
We won't consider these paths for the calculation of which node is on the most paths, but we will use these to build up our paths of length $2$.
|
||||||
|
We will loop over each \mintinline{Java}{Edge} in the graph, and insert each of its \mintinline{java}{Player} nodes into their own \mintinline{java}{ArrayList} of length $1$,
|
||||||
|
which will then each be inserted into a \mintinline{java}{Set} containing all the ``paths'' of length $1$.
|
||||||
|
The \mintinline{java}{Set} will ensure that the single-node paths have no duplicates.
|
||||||
|
|
||||||
|
\item Loop from $i=2$ to $i=N$ building up paths of length $i$.
|
||||||
|
We will do this by looping over each path of length $i-1$ in a sub-loop, and looping over each \mintinline{java}{Edge} in the graph in a sub-sub-loop.
|
||||||
|
While having nested loops is usually a bad sign, I feel justified in making use of them as finding each path requires it to be traversed, meaning that this will be a costly
|
||||||
|
process no matter what.
|
||||||
|
For each path of length $i-1$, we will check if its final node is also one of the nodes in each \mintinline{java}{Edge} object.
|
||||||
|
If it is, we know that the \mintinline{java}{Edge} object's other node could be appended to this path to make a path of length $i$.
|
||||||
|
Before doing this however, we will check if this other node is already in the path (as we are not allowing duplicate nodes) by using the \mintinline{java}{contains()} method
|
||||||
|
(which is computationally equivalent to looping over the whole list and comparing each object it contains to the node we want to append).
|
||||||
|
If the node is already in the path, we ignore it; otherwise, we make a new \mintinline{java}{ArrayList} by appending the node (\mintinline{java}{Player}) to the path and inserting
|
||||||
|
it into the \mintinline{java}{Set} of all paths of length $i$.
|
||||||
|
Then we increment the count for that \mintinline{Java}{Player} object in the \mintinline{Java}{HashMap}.
|
||||||
|
|
||||||
|
\item Return the \mintinline{java}{Player} object(s) from the \mintinline{Java}{HashMap} that has/have the highest count.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsubsection{Java Code to Determine Which Node(s) is/are On the Most Paths}
|
||||||
|
\begin{code}
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{Java}
|
||||||
|
// Input: A snapshot Graph object
|
||||||
|
// Output: A Set of Player objects that are on the most paths
|
||||||
|
public Set<Player> getMostInfluentialNode(Graph graph) {
|
||||||
|
HashMap<Player, Integer> counts = new HashMap<>();
|
||||||
|
|
||||||
|
// an ArrayList to hold the Sets of paths generated
|
||||||
|
ArrayList<Set<ArrayList<Player>>> setsOfPaths = new ArrayList<>();
|
||||||
|
|
||||||
|
// get a set of all the nodes in the graph
|
||||||
|
Set<Player> nodes = new HashSet<>();
|
||||||
|
for (Edge edge : graph.edges) {
|
||||||
|
for (Player player : edge.nodes) {
|
||||||
|
nodes.add(player);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate the "paths" of length 1
|
||||||
|
Set<ArrayList<Player>> oneNodePaths = new HashSet<>();
|
||||||
|
for (Player player : nodes) {
|
||||||
|
ArrayList<Player> path = new ArrayList<>();
|
||||||
|
path.add(player);
|
||||||
|
oneNodePaths.add(path);
|
||||||
|
}
|
||||||
|
// seed setsOfPaths with the length-1 paths so the loop below can extend them
|
||||||
|
setsOfPaths.add(oneNodePaths);
|
||||||
|
|
||||||
|
// loop from i = 2 to i = N building up paths of length i
|
||||||
|
for (int i = 2; i <= oneNodePaths.size(); i++) {
|
||||||
|
// get the Set of paths of length i-1 (stored at index i-2)
|
||||||
|
Set<ArrayList<Player>> iMinusOneLengthPaths = setsOfPaths.get(i - 2);
|
||||||
|
|
||||||
|
// create the Set of paths of length i
|
||||||
|
Set<ArrayList<Player>> iLengthPaths = new HashSet<>();
|
||||||
|
|
||||||
|
// loop over each path of length i-1
|
||||||
|
for (ArrayList<Player> path : iMinusOneLengthPaths) {
|
||||||
|
// loop over each edge in the graph
|
||||||
|
for (Edge edge : graph.edges) {
|
||||||
|
// check if the last node of the path is in the Edge
|
||||||
|
Player lastNode = path.get(path.size()-1);
|
||||||
|
|
||||||
|
// convert Set to Array so we can refer to the nodes by indices
|
||||||
|
// assuming here that each edge contains only two nodes -- more robust code would check for this
|
||||||
|
Player[] players = edge.nodes.toArray(new Player[2]);
|
||||||
|
|
||||||
|
// the code repetition here is not ideal, but it allows everything to be kept in one single method for readability
|
||||||
|
|
||||||
|
// if the 0th node of players is the same as lastNode, the 1st node of players can be appended
|
||||||
|
if (lastNode.equals(players[0])) {
|
||||||
|
// if players[1] is not already in the path
|
||||||
|
if (!path.contains(players[1])) {
|
||||||
|
// create new ArrayList to represent the path
|
||||||
|
ArrayList<Player> newPath = new ArrayList<>(path);
|
||||||
|
newPath.add(players[1]);
|
||||||
|
|
||||||
|
// increment the count for players[1]
|
||||||
|
int newCount = counts.containsKey(players[1]) ? counts.get(players[1]) + 1 : 1;
|
||||||
|
counts.put(players[1], newCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// else if the 1st node of players is the same as lastNode, the 0th node of players can be appended
|
||||||
|
else if (lastNode.equals(players[1])) {
|
||||||
|
// if players[0] is not already in the path
|
||||||
|
if (!path.contains(players[0])) {
|
||||||
|
// create new ArrayList to represent the path
|
||||||
|
ArrayList<Player> newPath = new ArrayList<>(path);
|
||||||
|
newPath.add(players[0]);
|
||||||
|
|
||||||
|
// increment the count for players[0]
|
||||||
|
int newCount = counts.containsKey(players[0]) ? counts.get(players[0]) + 1 : 1;
|
||||||
|
counts.put(players[0], newCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// store the paths of length i so the next iteration can extend them
|
||||||
|
setsOfPaths.add(iLengthPaths);
|
||||||
|
}
|
||||||
|
|
||||||
|
// find the highest value in the hashmap
|
||||||
|
// will be double what it technically should be for an undirected graph but it doesn't matter
|
||||||
|
int highestCount = 0;
|
||||||
|
for (int count : counts.values()) {
|
||||||
|
highestCount = (count > highestCount) ? count : highestCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set of Players with the highest counts to return
|
||||||
|
Set<Player> mostInfluential = new HashSet<>();
|
||||||
|
for (Player player : counts.keySet()) {
|
||||||
|
if (counts.get(player) == highestCount) {
|
||||||
|
mostInfluential.add(player);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return mostInfluential;
|
||||||
|
}
|
||||||
|
\end{minted}
|
||||||
|
\caption{Java Code to Determine Which Node(s) is/are On the Most Paths}
|
||||||
|
\end{code}
|
||||||
|
\end{document}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{players}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{player1}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{k}{TIMESTAMP}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{run}\PYG{p}{()}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,35 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
\PYG{c+c1}{// Note that in Java, there can be only one `public` class per file}
|
||||||
|
\PYG{c+c1}{// Therefore, if this code were to be actually used, each class must be in its own `*.java` file}
|
||||||
|
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kd}{class} \PYG{n+nc}{Game}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// potential data fields that could be contained within the Game class}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{gameId}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{String}\PYG{+w}{ }\PYG{n}{homeTeam}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{String}\PYG{+w}{ }\PYG{n}{awayTeam}\PYG{p}{;}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// HashMap of Graph objects encapsulated within the Game object}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}}\PYG{n}{LocalDateTime}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{Graph}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{graphs}\PYG{p}{;}
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kd}{class} \PYG{n+nc}{Graph}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{LocalDateTime}\PYG{+w}{ }\PYG{n}{timeGenerated}\PYG{p}{;}\PYG{+w}{ }\PYG{c+c1}{// need to import java.time.LocalDateTime for this to work}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// set of Edge objects encapsulated within the Graph object}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{Edge}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{edges}\PYG{p}{;}
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kd}{class} \PYG{n+nc}{Edge}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kt}{float}\PYG{+w}{ }\PYG{n}{distance}\PYG{p}{;}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// set of Player objects encapsulated within the Edge object - there should be no more than 2}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{nodes}\PYG{p}{;}
|
||||||
|
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kd}{class} \PYG{n+nc}{Player}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// potential data fields that would be contained within the Player class}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{String}\PYG{+w}{ }\PYG{n}{name}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{k+kd}{public}\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{playerId}\PYG{p}{;}\PYG{+w}{ }\PYG{c+c1}{// assuming each player in the league has a unique ID}
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{distance\PYGZus{}id}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{graphs}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{time\PYGZus{}generated}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{start}\PYG{p}{()}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{ArrayList}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{ArrayList}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,13 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{k}{CREATE}\PYG{+w}{ }\PYG{k}{TABLE}\PYG{+w}{ }\PYG{n}{distances}\PYG{+w}{ }\PYG{p}{(}
|
||||||
|
\PYG{+w}{ }\PYG{n}{graph\PYGZus{}id}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{n}{player1}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{n}{player2}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{n}{distance}\PYG{+w}{ }\PYG{n+nb}{FLOAT}\PYG{p}{,}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{k}{PRIMARY}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{graph\PYGZus{}id}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{player1}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{player2}\PYG{p}{),}
|
||||||
|
\PYG{+w}{ }\PYG{k}{FOREIGN}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{graph\PYGZus{}id}\PYG{p}{)}\PYG{+w}{ }\PYG{k}{REFERENCES}\PYG{+w}{ }\PYG{n}{graphs}\PYG{p}{(}\PYG{n}{graph\PYGZus{}id}\PYG{p}{),}
|
||||||
|
\PYG{+w}{ }\PYG{k}{FOREIGN}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{player1}\PYG{p}{)}\PYG{+w}{ }\PYG{k}{REFERENCES}\PYG{+w}{ }\PYG{n}{players}\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{),}
|
||||||
|
\PYG{+w}{ }\PYG{k}{FOREIGN}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{player2}\PYG{p}{)}\PYG{+w}{ }\PYG{k}{REFERENCES}\PYG{+w}{ }\PYG{n}{players}\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{),}
|
||||||
|
\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{distances}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{contains}\PYG{p}{()}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Game}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,10 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{k}{CREATE}\PYG{+w}{ }\PYG{k}{TABLE}\PYG{+w}{ }\PYG{n}{graphs}\PYG{+w}{ }\PYG{p}{(}
|
||||||
|
\PYG{+w}{ }\PYG{n}{graph\PYGZus{}id}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{+w}{ }\PYG{n}{AUTO\PYGZus{}INCREMENT}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{n}{game\PYGZus{}id}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{n}{time\PYGZus{}generated}\PYG{+w}{ }\PYG{k}{TIMESTAMP}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{p}{,}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{k}{PRIMARY}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{graph\PYGZus{}id}\PYG{p}{),}
|
||||||
|
\PYG{+w}{ }\PYG{k}{FOREIGN}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{game\PYGZus{}id}\PYG{p}{)}\PYG{+w}{ }\PYG{k}{REFERENCES}\PYG{+w}{ }\PYG{n}{games}\PYG{p}{(}\PYG{n}{game\PYGZus{}id}\PYG{p}{)}\PYG{+w}{ }\PYG{c+c1}{\PYGZhy{}\PYGZhy{} assuming that a `graphs` table exists elsewhere}
|
||||||
|
\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Thread}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{player2}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{distance}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Set}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Set}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{Integer}\PYG{o}{\PYGZgt{}}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Player}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Player}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,8 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{k}{CREATE}\PYG{+w}{ }\PYG{k}{TABLE}\PYG{+w}{ }\PYG{n}{players}\PYG{+w}{ }\PYG{p}{(}
|
||||||
|
\PYG{+w}{ }\PYG{n}{player\PYGZus{}id}\PYG{+w}{ }\PYG{n+nb}{INT}\PYG{+w}{ }\PYG{k}{NOT}\PYG{+w}{ }\PYG{k}{NULL}\PYG{+w}{ }\PYG{n}{AUTO\PYGZus{}INCREMENT}\PYG{p}{,}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{\PYGZhy{}\PYGZhy{} whatever other relevant information for each player should be included here}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{k}{PRIMARY}\PYG{+w}{ }\PYG{k}{KEY}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{player\PYGZus{}id}\PYG{p}{)}
|
||||||
|
\PYG{p}{)}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{nodes}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Edge}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Edge}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{graph\PYGZus{}id}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{games}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,32 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
\PYG{c+c1}{// Depending on how often we intend to do this calculation and how we intend to do it, it would likely be better to put this method in the `Graph` class.}
|
||||||
|
\PYG{c+c1}{// However, to keep in line with the simplicity of the classes as defined previously, I have opted to pass the Graph to the method as an argument rather than change the Graph class.}
|
||||||
|
|
||||||
|
\PYG{c+c1}{// Input: A snapshot Graph object.}
|
||||||
|
\PYG{c+c1}{// Output: A HashMap data structure in which the key is a Player (node) object and the value accessible by that key is the degree of that Player (node) object.}
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{Integer}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n+nf}{calculateDegrees}\PYG{p}{(}\PYG{n}{Graph}\PYG{+w}{ }\PYG{n}{graph}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{Integer}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{returnValues}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// looping over each Edge in the Graph}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Edge}\PYG{+w}{ }\PYG{n}{edge}\PYG{p}{:}\PYG{+w}{ }\PYG{n}{graph}\PYG{p}{.}\PYG{n+na}{edges}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// loop over each Player in the Edge's `nodes` Set}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// there should really be no more than two, practically speaking, and no less than 2 assuming that we are not dealing with multigraphs but the number of nodes an edge joins doesn't matter from an algorithmic perspective}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Player}\PYG{+w}{ }\PYG{n}{player}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{edge}\PYG{p}{.}\PYG{n+na}{nodes}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// get the current degree count for the player if it is already defined}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Integer}\PYG{+w}{ }\PYG{n}{degree}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{returnValues}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{player}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// if the degree is not yet defined, set it to 1}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{degree}\PYG{+w}{ }\PYG{o}{==}\PYG{+w}{ }\PYG{k+kc}{null}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{degree}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{l+m+mi}{1}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// otherwise increment it by 1}
|
||||||
|
\PYG{+w}{ }\PYG{k}{else}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{degree}\PYG{o}{++}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// set the player's degree in returnValues to the updated value}
|
||||||
|
\PYG{+w}{ }\PYG{n}{returnValues}\PYG{p}{.}\PYG{n+na}{put}\PYG{p}{(}\PYG{n}{player}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{degree}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{HashMap}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,104 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
\PYG{c+c1}{// Input: A snapshot Graph object}
|
||||||
|
\PYG{c+c1}{// Output: A Set of Player objects that are on the most paths}
|
||||||
|
\PYG{k+kd}{public}\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n+nf}{getMostInfluentialNode}\PYG{p}{(}\PYG{n}{Graph}\PYG{+w}{ }\PYG{n}{graph}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{Integer}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{counts}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashMap}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// an ArrayList to hold the Sets of paths generated}
|
||||||
|
\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}\PYGZgt{}\PYGZgt{}}\PYG{+w}{ }\PYG{n}{setsOfPaths}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// get a set of all the nodes in the graph}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{nodes}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashSet}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Edge}\PYG{+w}{ }\PYG{n}{edge}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{graph}\PYG{p}{.}\PYG{n+na}{edges}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Player}\PYG{+w}{ }\PYG{n}{player}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{edge}\PYG{p}{.}\PYG{n+na}{nodes}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{nodes}\PYG{p}{.}\PYG{n+na}{add}\PYG{p}{(}\PYG{n}{player}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// generate the "paths" of length 1}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}\PYGZgt{}}\PYG{+w}{ }\PYG{n}{oneNodePaths}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashSet}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Player}\PYG{+w}{ }\PYG{n}{player}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{nodes}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{path}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
\PYG{+w}{ }\PYG{n}{path}\PYG{p}{.}\PYG{n+na}{add}\PYG{p}{(}\PYG{n}{player}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{n}{oneNodePaths}\PYG{p}{.}\PYG{n+na}{add}\PYG{p}{(}\PYG{n}{path}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// loop from i = 2 to i = N building up paths of length i}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{i}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{l+m+mi}{2}\PYG{p}{;}\PYG{+w}{ }\PYG{n}{i}\PYG{+w}{ }\PYG{o}{\PYGZlt{}=}\PYG{+w}{ }\PYG{n}{oneNodePaths}\PYG{p}{.}\PYG{n+na}{size}\PYG{p}{();}\PYG{+w}{ }\PYG{n}{i}\PYG{o}{+=}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// get the Set of paths of length i-1}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}\PYGZgt{}}\PYG{+w}{ }\PYG{n}{iMinusOneLengthPaths}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{setsOfPaths}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{i}\PYG{o}{\PYGZhy{}}\PYG{l+m+mi}{1}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// create the Set of paths of length i}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}\PYGZgt{}}\PYG{+w}{ }\PYG{n}{iLengthPaths}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashSet}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// loop over each path of length i-1}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{path}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{iMinusOneLengthPaths}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// loop over each edge in the graph}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Edge}\PYG{+w}{ }\PYG{n}{edge}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{graph}\PYG{p}{.}\PYG{n+na}{edges}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// check if the last node of the path is in the Edge}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Player}\PYG{+w}{ }\PYG{n}{lastNode}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{path}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{path}\PYG{p}{.}\PYG{n+na}{size}\PYG{p}{()}\PYG{o}{\PYGZhy{}}\PYG{l+m+mi}{1}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// convert Set to Array so we can refer to the nodes by indices}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// assuming here that each edge contains only two nodes -- more robust code would check for this}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Player}\PYG{o}{[]}\PYG{+w}{ }\PYG{n}{players}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{edge}\PYG{p}{.}\PYG{n+na}{nodes}\PYG{p}{.}\PYG{n+na}{toArray}\PYG{p}{(}\PYG{k}{new}\PYG{+w}{ }\PYG{n}{Player}\PYG{o}{[}\PYG{l+m+mi}{2}\PYG{o}{]}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// the code repetition here is not ideal, but it allows everything to be kept in one single method for readability}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// if the 0th node of players is the same as lastNode, the 1st node of players can be appended}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{lastNode}\PYG{p}{.}\PYG{n+na}{equals}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// if players[1] is not already in the path}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{o}{!}\PYG{n}{path}\PYG{p}{.}\PYG{n+na}{contains}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// create new ArrayList to represent the path}
|
||||||
|
\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{newPath}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{(}\PYG{n}{path}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{n}{newPath}\PYG{p}{.}\PYG{n+na}{append}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// increment the count for players[1]}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{containsKey}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{newCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{)}\PYG{o}{++}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{k}{else}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{newCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{l+m+mi}{1}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{put}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{newCount}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// else if the 1st node of players is the same as lastNode, the 0th node of players can be appended}
|
||||||
|
\PYG{+w}{ }\PYG{k}{else}\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{lastNode}\PYG{p}{.}\PYG{n+na}{equals}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{1}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// if players[0] is not already in the path}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{o}{!}\PYG{n}{path}\PYG{p}{.}\PYG{n+na}{contains}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// create new ArrayList to represent the path}
|
||||||
|
\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{newPath}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{ArrayList}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{(}\PYG{n}{path}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{n}{newPath}\PYG{p}{.}\PYG{n+na}{append}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{);}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// increment the count for players[0]}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{containsKey}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{))}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{newCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{)}\PYG{o}{++}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{k}{else}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{newCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{l+m+mi}{1}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{put}\PYG{p}{(}\PYG{n}{players}\PYG{o}{[}\PYG{l+m+mi}{0}\PYG{o}{]}\PYG{p}{,}\PYG{+w}{ }\PYG{n}{newCount}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// find the highest value in the hashmap}
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// will be double what it technically should be for an undirected graph but it doesn't matter}
|
||||||
|
\PYG{+w}{ }\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{highestCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{l+m+mi}{0}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{k+kt}{int}\PYG{+w}{ }\PYG{n}{count}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{values}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{highestCount}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{count}\PYG{+w}{ }\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{highestCount}\PYG{p}{)}\PYG{+w}{ }\PYG{o}{?}\PYG{+w}{ }\PYG{n}{count}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{highestCount}\PYG{p}{;}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{c+c1}{// Set of Players with the highest counts to return}
|
||||||
|
\PYG{+w}{ }\PYG{n}{Set}\PYG{o}{\PYGZlt{}}\PYG{n}{Player}\PYG{o}{\PYGZgt{}}\PYG{+w}{ }\PYG{n}{mostInfluential}\PYG{+w}{ }\PYG{o}{=}\PYG{+w}{ }\PYG{k}{new}\PYG{+w}{ }\PYG{n}{HashSet}\PYG{o}{\PYGZlt{}\PYGZgt{}}\PYG{p}{();}
|
||||||
|
\PYG{+w}{ }\PYG{k}{for}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{Player}\PYG{+w}{ }\PYG{n}{player}\PYG{+w}{ }\PYG{p}{:}\PYG{+w}{ }\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{keySet}\PYG{p}{())}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{k}{if}\PYG{+w}{ }\PYG{p}{(}\PYG{n}{counts}\PYG{p}{.}\PYG{n+na}{get}\PYG{p}{(}\PYG{n}{player}\PYG{p}{)}\PYG{+w}{ }\PYG{o}{==}\PYG{+w}{ }\PYG{n}{highestCount}\PYG{p}{)}\PYG{+w}{ }\PYG{p}{\PYGZob{}}
|
||||||
|
\PYG{+w}{ }\PYG{n}{mostInfluential}\PYG{p}{.}\PYG{n+na}{add}\PYG{p}{(}\PYG{n}{player}\PYG{p}{);}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
\PYG{+w}{ }\PYG{p}{\PYGZcb{}}
|
||||||
|
|
||||||
|
\PYG{+w}{ }\PYG{k}{return}\PYG{+w}{ }\PYG{n}{mostInfluential}\PYG{p}{;}
|
||||||
|
\PYG{p}{\PYGZcb{}}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{game\PYGZus{}id}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{n}{Graph}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,76 @@
|
|||||||
|
|
||||||
|
\makeatletter
|
||||||
|
\def\PYG@reset{\let\PYG@it=\relax \let\PYG@bf=\relax%
|
||||||
|
\let\PYG@ul=\relax \let\PYG@tc=\relax%
|
||||||
|
\let\PYG@bc=\relax \let\PYG@ff=\relax}
|
||||||
|
\def\PYG@tok#1{\csname PYG@tok@#1\endcsname}
|
||||||
|
\def\PYG@toks#1+{\ifx\relax#1\empty\else%
|
||||||
|
\PYG@tok{#1}\expandafter\PYG@toks\fi}
|
||||||
|
\def\PYG@do#1{\PYG@bc{\PYG@tc{\PYG@ul{%
|
||||||
|
\PYG@it{\PYG@bf{\PYG@ff{#1}}}}}}}
|
||||||
|
\def\PYG#1#2{\PYG@reset\PYG@toks#1+\relax+\PYG@do{#2}}
|
||||||
|
|
||||||
|
\@namedef{PYG@tok@c}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cp}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cs}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@k}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kd}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nb}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@bp}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nn}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nf}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nv}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@no}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ow}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@s}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@err}{\def\PYG@bc##1{{\setlength{\fboxsep}{\string -\fboxrule}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}}
|
||||||
|
\@namedef{PYG@tok@kc}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kn}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kp}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kr}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kt}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@fm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vg}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vi}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sa}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sb}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sc}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@dl}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sd}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s2}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@se}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sh}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@si}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sx}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sr}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ss}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ch}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cm}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cpf}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@c1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
|
||||||
|
\def\PYGZbs{\char`\\}
|
||||||
|
\def\PYGZus{\char`\_}
|
||||||
|
\def\PYGZob{\char`\{}
|
||||||
|
\def\PYGZcb{\char`\}}
|
||||||
|
\def\PYGZca{\char`\^}
|
||||||
|
\def\PYGZam{\char`\&}
|
||||||
|
\def\PYGZlt{\char`\<}
|
||||||
|
\def\PYGZgt{\char`\>}
|
||||||
|
\def\PYGZsh{\char`\#}
|
||||||
|
\def\PYGZpc{\char`\%}
|
||||||
|
\def\PYGZdl{\char`\$}
|
||||||
|
\def\PYGZhy{\char`\-}
|
||||||
|
\def\PYGZsq{\char`\'}
|
||||||
|
\def\PYGZdq{\char`\"}
|
||||||
|
\def\PYGZti{\char`\~}
|
||||||
|
% for compatibility with earlier versions
|
||||||
|
\def\PYGZat{@}
|
||||||
|
\def\PYGZlb{[}
|
||||||
|
\def\PYGZrb{]}
|
||||||
|
\makeatother
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 52 KiB |
@ -0,0 +1,2 @@
|
|||||||
|
PWD /home/andrew/edu/third/semester1/CT3532: Database Systems II/assignments/assignment3/latex
|
||||||
|
INPUT /usr/local/texlive/2023/texmf-var/web2c/luahbtex/lualatex.fmt
|
51
third/semester1/CT3532: Database Systems II/exam_notes.txt
Normal file
51
third/semester1/CT3532: Database Systems II/exam_notes.txt
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
- 4 questions, do 3
|
||||||
|
- same type of questions as previous years
|
||||||
|
- overlap in terms of style with exercise sheets
|
||||||
|
- most questions have 3 parts with different marks
|
||||||
|
|
||||||
|
- question 1 is on design (first topic covered)
|
||||||
|
a) functional dependencies minimal cover sets (problem based)
|
||||||
|
b) normalisation to BCNF (discursive/descriptive)
|
||||||
|
c) denormalisation (discursive/descriptive)
|
||||||
|
|
||||||
|
- key things to know in general:
|
||||||
|
- functional dependency
|
||||||
|
- axioms to generate new functional dependencies
|
||||||
|
- minimal cover sets
|
||||||
|
- normal forms
|
||||||
|
- redundancy / denormalisation
|
||||||
|
|
||||||
|
- question 2: transactions
|
||||||
|
a) discuss one of problems (only 3 (think lost update problem etc not sure)) (supposedly easy)
|
||||||
|
b) concurrency control protocol to apply to above problem, such as timestamping, two phase locking etc.
|
||||||
|
c) distributed databases, how do we manage recovery or two phase locking or something
|
||||||
|
|
||||||
|
- problems that arise: concurrency control, lost update, etc
|
||||||
|
- serialisability, guarantee serialisability
|
||||||
|
- recovery, system log
|
||||||
|
- distributed databases
|
||||||
|
|
||||||
|
- question 3: indexing
|
||||||
|
a) b trees
|
||||||
|
b) dynamic indexes
|
||||||
|
c) multiattribute indexes / joint attribute indexes
|
||||||
|
|
||||||
|
- B tree, B+ tree (will be asked on b trees)
|
||||||
|
- hashing
|
||||||
|
- extendible / dynamic
|
||||||
|
- linear
|
||||||
|
- multi-attribute
|
||||||
|
- operators
|
||||||
|
|
||||||
|
|
||||||
|
- question 4: database models
|
||||||
|
a) parallelism
|
||||||
|
b) deductive / logic databases (not covered on past papers) (not too hard because we didn't spend long on it)
|
||||||
|
- store facts and relations
|
||||||
|
c) security (one of the models touched on 2023-11-14) (not covered on past papers)
|
||||||
|
- Bell-LaPadula model of security
|
||||||
|
|
||||||
|
- old course was called ct332
|
||||||
|
- will get deductive / logic databases on old papers
|
||||||
|
|
||||||
|
- no more exam material after ./slides/DatabaseSecurity.pdf
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,750 @@
|
|||||||
|
%! TeX program = lualatex
|
||||||
|
\documentclass[a4paper,11pt]{article}
|
||||||
|
% packages
|
||||||
|
\usepackage{fontspec}
|
||||||
|
\setmainfont{EB Garamond}
|
||||||
|
% for tironian et fallback
|
||||||
|
% % \directlua{luaotfload.add_fallback
|
||||||
|
% % ("emojifallback",
|
||||||
|
% % {"Noto Serif:mode=harf"}
|
||||||
|
% % )}
|
||||||
|
% % \setmainfont{EB Garamond}[RawFeature={fallback=emojifallback}]
|
||||||
|
|
||||||
|
\setmonofont[Scale=MatchLowercase]{DejaVu Sans Mono}
|
||||||
|
\usepackage[a4paper,left=2cm,right=2cm,top=\dimexpr15mm+1.5\baselineskip,bottom=2cm]{geometry}
|
||||||
|
\setlength{\parindent}{0pt}
|
||||||
|
|
||||||
|
\usepackage{fancyhdr} % Headers and footers
|
||||||
|
\fancyhead[R]{\normalfont \leftmark}
|
||||||
|
\fancyhead[L]{}
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\usepackage{microtype} % Slightly tweak font spacing for aesthetics
|
||||||
|
\usepackage[english]{babel} % Language hyphenation and typographical rules
|
||||||
|
\usepackage[final, colorlinks = true, urlcolor = blue, linkcolor = black]{hyperref}
|
||||||
|
\usepackage{changepage} % adjust margins on the fly
|
||||||
|
\usepackage{amsmath}
|
||||||
|
|
||||||
|
\usepackage{minted}
|
||||||
|
\usemintedstyle{algol_nu}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{algpseudocode}
|
||||||
|
|
||||||
|
\usepackage{tkz-graph}
|
||||||
|
\usetikzlibrary{positioning, fit, shapes.geometric}
|
||||||
|
\usepackage{pgfplots}
|
||||||
|
\pgfplotsset{width=\textwidth,compat=1.9}
|
||||||
|
|
||||||
|
\usepackage{caption}
|
||||||
|
\newenvironment{code}{\captionsetup{type=listing}}{}
|
||||||
|
|
||||||
|
\usepackage[yyyymmdd]{datetime}
|
||||||
|
\renewcommand{\dateseparator}{-}
|
||||||
|
|
||||||
|
\usepackage{titlesec}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\begin{titlepage}
|
||||||
|
\begin{center}
|
||||||
|
\hrule
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\huge \textbf{CT3532}
|
||||||
|
\vspace*{0.6cm}
|
||||||
|
\hrule
|
||||||
|
\LARGE
|
||||||
|
\vspace{0.5cm}
|
||||||
|
DATABASE SYSTEMS II
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
|
||||||
|
\vfill
|
||||||
|
\includegraphics[width=0.7\textwidth]{images/db.png}
|
||||||
|
\vfill
|
||||||
|
|
||||||
|
\Large
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\textbf{Andreas Ó hAoḋa}
|
||||||
|
% \vspace{0.5cm}
|
||||||
|
% \hrule
|
||||||
|
% \vspace{0.5cm}
|
||||||
|
|
||||||
|
\normalsize
|
||||||
|
University of Galway
|
||||||
|
|
||||||
|
\today
|
||||||
|
|
||||||
|
\vspace{0.5cm}
|
||||||
|
\hrule
|
||||||
|
\end{center}
|
||||||
|
\end{titlepage}
|
||||||
|
|
||||||
|
\pagenumbering{roman}
|
||||||
|
\newpage
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
\setcounter{page}{1}
|
||||||
|
\pagenumbering{arabic}
|
||||||
|
|
||||||
|
\section{Introduction}
|
||||||
|
\subsection{Recommended Texts}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \emph{Fundamentals of Database Systems} by Elmasri and Navathe 005.74 ELM
|
||||||
|
\item \emph{Database system concepts} by Silberschatz, A. 005.74 SIL
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Assessment}
|
||||||
|
Continuous Assessment accounts for 30\% of the final grade, and the exam accounts for the remaining 70\%.
|
||||||
|
Plagiarism of assignments is not permitted - This is strictly enforced.
|
||||||
|
|
||||||
|
\subsubsection{Assignments}
|
||||||
|
|
||||||
|
\section{Design}
|
||||||
|
\subsection{Re-cap}
|
||||||
|
\textbf{Normalisation} can be used to develop a normalised relational schema given the universal relation, and verify
|
||||||
|
the correctness of relational schema developed from conceptual design.
|
||||||
|
We decompose relations such that it satisfies successively restrictive normal forms.
|
||||||
|
\\\\
|
||||||
|
Desirable properties of a relational schema include:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Clear semantics of a relation:} The \textbf{semantics} of a relation refers to how the attributes
|
||||||
|
grouped together in a relation are to be interpreted.
|
||||||
|
If ER modelling is done carefully and the mapping is undertaken correctly, it is likely that the semantics of
|
||||||
|
the resulting relation will be \emph{clear}.
|
||||||
|
One should try to design a relation so that it is easy to explain its meaning.
|
||||||
|
\item \textbf{Reducing the number of redundant values in tuples:} The presence of redundancy leads to waste of
|
||||||
|
storage space and potential for anomalies (deletion, update, insertion).
|
||||||
|
One should try to design relations so that no anomalies may occur. If an anomaly can occur, it should be noted.
|
||||||
|
Normalisation will remove many of the potential anomalies.
|
||||||
|
\item \textbf{Reducing the number of null values in tuples:} Having null values is often necessary, but it can
|
||||||
|
create problems, such as:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Wasted space.
|
||||||
|
\item Different interpretations, i.e.: attribute does not apply to this tuple, attribute value is
|
||||||
|
unknown, attribute value is known but absent, etc.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\item \textbf{Disallowing the possibility of generating spurious tuples:} If a relation $R$ is decomposed into $R_1$
|
||||||
|
\& $R_2$ and connected via a primary key -- foreign key pair, then performing an equi-join between $R_1$ \&
|
||||||
|
$R_2$ on the involved keys should not produce tuples that were not in the original relation $R$.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
More formally, we typically have a relation $R$ and a set of functional dependencies $F$, defined over $R$.
|
||||||
|
We wish to create a decomposition $D = \{R_1, R_2, \dots, R_n\}$
|
||||||
|
We wish to guarantee certain properties of this decomposition.
|
||||||
|
We require that all attributes in the original $R$ be maintained in the decomposition, \textit{id est}:
|
||||||
|
$R = R_1 \cup R_2 \cup \dots \cup R_n$
|
||||||
|
\begin{itemize}
|
||||||
|
\item A relation is said to be in the \textbf{First Normal Form (1NF)} if there are no repeating fields.
|
||||||
|
\item A relation is said to be in the \textbf{Second Normal Form (2NF)} if it is in 1NF and if every non-prime
|
||||||
|
attribute is fully functionally dependent on the key.
|
||||||
|
\item A relation is said to be in the \textbf{Third Normal Form (3NF)} if it is in 2NF and if no non-prime
|
||||||
|
attribute is transitively dependent on the key.
|
||||||
|
\item A relation is said to be in the \textbf{Boyce-Codd Normal Form (BCNF)} if the relation is in 3NF and if
|
||||||
|
every determinant is a candidate key.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{Example}
|
||||||
|
\begin{table}[H]
|
||||||
|
\centering
|
||||||
|
\begin{tabular}{lll}
|
||||||
|
\hline
|
||||||
|
StudentNo & Major & Advisor \\ \hline
|
||||||
|
123 & I.T. & Smith \\
|
||||||
|
123 & Econ & Murphy \\
|
||||||
|
444 & Biol. & O' Reilly \\
|
||||||
|
617 & I.T. & Jones \\
|
||||||
|
829 & I.T. & Smith \\ \hline
|
||||||
|
\end{tabular}
|
||||||
|
\caption{Sample Data}
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
Constraints:
|
||||||
|
\begin{itemize}
|
||||||
|
\item A student may have more than one major.
|
||||||
|
\item For each major, a student can have only one advisor.
|
||||||
|
\item Each major can have several advisors.
|
||||||
|
\item Each advisor advises one major.
|
||||||
|
\item Each advisor can advise several students.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Functional dependencies:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \{StudentNo, Major\} $\rightarrow$ \{Advisor\}
|
||||||
|
\item \{Advisor\} $\rightarrow$ \{Major\}
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
An update anomaly may exist: If student 444 changes major, we lose information that O' Reilly supervises Biology.
|
||||||
|
To solve this, we can decompose the tables so as to satisfy BCNF:
|
||||||
|
\begin{itemize}
|
||||||
|
\item TAKES: \underline{StudentNo, Advisor}
|
||||||
|
\item ADVISES: \underline{Advisor}, Major
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{General Rule}
|
||||||
|
Consider a relation $R$ with functional dependencies $F$.
|
||||||
|
If $X \rightarrow Y$ violates BCNF, decompose $R$ into:
|
||||||
|
\begin{itemize}
|
||||||
|
\item $\{R - Y\}$
|
||||||
|
\item $\{XY\}$
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{Exercise}
|
||||||
|
Let $R = \{A, B, C, D, E, F, G, H\}$.
|
||||||
|
The functional dependencies defined over $R$ are:
|
||||||
|
\begin{itemize}
|
||||||
|
\item $A \rightarrow D$
|
||||||
|
\item $B \rightarrow E$
|
||||||
|
\item $E \rightarrow F$
|
||||||
|
\item $F \rightarrow G$
|
||||||
|
\item $F \rightarrow H$
|
||||||
|
\item $\{A, B\} \rightarrow C$
|
||||||
|
\item $C \rightarrow A$
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\SetGraphUnit{2}
|
||||||
|
\SetUpEdge[style={->}]
|
||||||
|
\Vertices{circle}{A, B, C, D, E, F, G, H}
|
||||||
|
\node[ellipse, draw=black, fit=(A) (B), inner sep=-1mm] (AB) {};
|
||||||
|
|
||||||
|
\Edge(A)(D)
|
||||||
|
\Edge(B)(E)
|
||||||
|
\Edge(E)(F)
|
||||||
|
\Edge(F)(G)
|
||||||
|
\Edge(F)(H)
|
||||||
|
\Edge(AB)(C)
|
||||||
|
\Edge(C)(A)
|
||||||
|
\end{tikzpicture}
|
||||||
|
|
||||||
|
|
||||||
|
Decompose $R$ such that the BCNF is satisfied.
|
||||||
|
|
||||||
|
|
||||||
|
\section{Design by Synthesis}
|
||||||
|
\subsection{Background}
|
||||||
|
Typically, we have the relation $R$ and a set of functional dependencies $F$.
|
||||||
|
We wish to create a decomposition $D = R_1, R_2, \dots, R_m$.
|
||||||
|
Clearly, all attributes of $R$ must occur in at least one schema $R_i$, \textit{id est}: $\bigcup^{m}_{i=1} R_i = R$.
|
||||||
|
This is known as the \textbf{attribute preservation} constraint.
|
||||||
|
\\\\
|
||||||
|
A \textbf{functional dependency} is a constraint between two sets of attributes.
|
||||||
|
A functional dependency $X \rightarrow Y$ exists if, for all tuples $t_1$ \& $t_2$ such that $t_1[X] = t_2[X]$, then
|
||||||
|
$t_1[Y] = t_2[Y]$.
|
||||||
|
We usually only specify the obvious functional dependencies, there may be many more.
|
||||||
|
Given a set of functional dependencies $F$, the \textbf{closure of \textit{F}}, denoted $F^+$, refers to all dependencies that
|
||||||
|
can be derived from $F$.
|
||||||
|
|
||||||
|
\subsubsection{Armstrong's Axioms}
|
||||||
|
\textbf{Armstrong's Axioms} are a set of inference rules that allow us to deduce all functional dependencies from a given
|
||||||
|
initial set.
|
||||||
|
They are:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Reflexivity:} if $X \supseteq Y$, then $X \rightarrow Y$.
|
||||||
|
\item \textbf{Augmentation:} if $X \rightarrow Y$, then $XZ \rightarrow YZ$.
|
||||||
|
\item \textbf{Transitivity:} if $X \rightarrow Y$, $Y \rightarrow Z$, then $X \rightarrow Z$.
|
||||||
|
\item \textbf{Projectivity:} if $X \rightarrow YZ$, then $X \rightarrow Z$.
|
||||||
|
\item \textbf{Additivity:} if $X \rightarrow Y$, $X \rightarrow Z$, then $X \rightarrow YZ$.
|
||||||
|
\item \textbf{Pseudo-transitivity:} if $X \rightarrow Y$, $WY \rightarrow Z$, then $WX \rightarrow Z$.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The first three rules have been shown to be \textbf{sound} \& \textbf{complete}:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Sound:} Given a set $F$ on a relation $R$, any dependency we can infer from $F$ using the first three
|
||||||
|
rules holds for every state $r$ of $R$ that satisfies the dependencies in $F$.
|
||||||
|
\item \textbf{Complete:} We can use the first three rules repeatedly to infer all possible dependencies that can be
|
||||||
|
inferred from $F$.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
For any sets of attributes $A$, we can infer $A^+$, the set of attributes that are functionally determined by $A$ given
|
||||||
|
a set of functional dependencies.
|
||||||
|
|
||||||
|
% skipped page 8 here
|
||||||
|
\subsubsection{Cover Sets}
|
||||||
|
A set of functional dependencies $F$, \textbf{covers} a set of functional dependencies $E$ if every functional dependency
|
||||||
|
in $E$ is in $F^+$.
|
||||||
|
\\\\
|
||||||
|
Two sets of functional dependencies $E$ \& $F$ are equivalent if $E^+ = F^+$.
|
||||||
|
We can check if $F$ covers $E$ by calculating $A^+$ with respect to $F$ for each functional dependency $A \rightarrow B$
|
||||||
|
and then checking that $A^+$ includes all the attributes of $B$.
|
||||||
|
\\\\
|
||||||
|
A set of functional dependencies, $F$, is \textbf{minimal} if:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Every functional dependency in $F$ has a single attribute for its right-hand side.
|
||||||
|
\item We cannot remove any dependency from $F$ and maintain a set of dependencies equivalent to $F$.
|
||||||
|
\item We cannot replace any dependency $X \rightarrow A$ with a dependency $Y \rightarrow A$ where $Y \subset X$,
|
||||||
|
and still maintain a set of dependencies equivalent to $F$.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
All functional dependencies $X \rightarrow Y$ specified in $F$ should exist in one of the schema $R_i$, or should be
|
||||||
|
inferable from the dependencies in $R_i$; this is known as the \textbf{dependency preservation} constraint.
|
||||||
|
Each functional dependency specifies some constraint; if the dependency is absent, then some desired constraint is also
|
||||||
|
absent.
|
||||||
|
If a functional dependency is absent, then we must enforce the constraint in some other manner; this can be inefficient.
|
||||||
|
\\\\
|
||||||
|
Given $F$ \& $R$, the \textbf{projection} of $F$ on $R_i$, denoted $\pi_{R_i}(F)$ where $R_i$ is a subset of $R$, is the
|
||||||
|
set of dependencies $X \rightarrow Y$ in $F^+$ such that the attributes $X \cup Y \subseteq R_i$.
|
||||||
|
A decomposition of $R$ is dependency-preserving if:
|
||||||
|
$$
|
||||||
|
((\pi_{R_1}(F)) \cup \dots \cup (\pi_{R_m}(F)))^+ = F^+
|
||||||
|
$$
|
||||||
|
|
||||||
|
\textbf{Theorem:}
|
||||||
|
It is always possible to find a decomposition $D$ with respect to $F$ such that:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item The decomposition is dependency-preserving.
|
||||||
|
\item All $R_i$ in $D$ are in 3NF.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
We can always guarantee a dependency-preserving decomposition to 3NF.
|
||||||
|
\textbf{Algorithm:}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Find a minimal cover set $G$ for $F$.
|
||||||
|
\item For each left-hand side $X$ of a functional dependency in $G$, create a relation
|
||||||
|
$X \cup A_1 \cup A_2 \dots A_m$ in $D$, where $X \rightarrow A_1$, $X \rightarrow A_2, \dots$ are the only
|
||||||
|
dependencies in $G$ with $X$ as a left-hand side.
|
||||||
|
\item Group any remaining attributes into a single relation.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsubsection{Lossless Joins}
|
||||||
|
Consider the following relation:
|
||||||
|
\begin{itemize}
|
||||||
|
\item EMPPROJ: \underline{ssn, pnumber}, hours, ename, pname, plocation
|
||||||
|
\end{itemize}
|
||||||
|
and its decomposition to:
|
||||||
|
\begin{itemize}
|
||||||
|
\item EMPPROJ1: \underline{ename, plocation}
|
||||||
|
\item EMPLOCAN: \underline{ssn, pno}, hrs, pname, plocation
|
||||||
|
\end{itemize}
|
||||||
|
If we perform a natural join on these relations, we may generate spurious tuples.
|
||||||
|
When a natural join is issued against relations, no spurious tuples should be generated.
|
||||||
|
A decomposition $D = \{R_1, R_2, \dots R_n\}$ of $R$ has the \textbf{lossless join} (or non-additive join) property with
|
||||||
|
regard to $F$ on $R$ if, for every instance $r$, the following holds:
|
||||||
|
$$ \bowtie (\pi_{R_1}(r), \dots \pi_{R_m}(r)) = r $$
|
||||||
|
We can automate a procedure for testing for the lossless property.
|
||||||
|
We can also automate the decomposition of $R$ into $R_1, \dots R_m$ such that it possesses the lossless join property.
|
||||||
|
\\\\
|
||||||
|
|
||||||
|
A decomposition $D = \{R_1, R_2\}$ has the lossless property if \& only if:
|
||||||
|
\begin{itemize}
|
||||||
|
\item The functional dependency $(R_1 \cap R_2) \rightarrow \{R_1 - R_2\}$ is in $F^+$, \emph{or}
|
||||||
|
\item The functional dependency $(R_1 \cap R_2) \rightarrow \{R_2 - R_1\}$ is in $F^+$.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Furthermore, if a decomposition has the lossless property and we decompose one of $R_i$ such that this also is a lossless
|
||||||
|
decomposition, then replacing that decomposition of $R_i$ in the original decomposition will result in a lossless
|
||||||
|
decomposition.
|
||||||
|
\\\\
|
||||||
|
\textbf{Algorithm:} To decompose to BCNF:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Let $D = R$.
|
||||||
|
\item While there is a schema $B$ in $D$ that violates BCNF, do:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Find the functional dependency $(X \rightarrow Y)$ that violates BCNF.
|
||||||
|
\item Replace $B$ with $(B-Y)$ \& $(X \cup Y)$.
|
||||||
|
\end{enumerate}
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
This guarantees a decomposition such that all attributes are preserved, the lossless join property is enforced,
|
||||||
|
and all $R_i$ are in BCNF.
|
||||||
|
It is not always possible to decompose $R$ into a set of $R_i$ such that all $R_i$ satisfy BCNF and properties
|
||||||
|
of lossless joins \& dependency preservation are maintained.
|
||||||
|
We can guarantee a decomposition such that:
|
||||||
|
\begin{itemize}
|
||||||
|
\item All attributes are preserved.
|
||||||
|
\item All relations are in 3NF.
|
||||||
|
\item All functional dependencies are maintained.
|
||||||
|
\item The lossless join property is maintained.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\textbf{Algorithm:} Finding a key for a relation schema $R$.
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Set $K := R$.
|
||||||
|
\item For each attribute $A \in K$, compute $(K-A)^+$ with respect to the set of functional dependencies.
|
||||||
|
If $(K-A)^+$ contains all the attributes in $R$, the set $K := K - \{A\}$.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
Given a set of functional dependencies $F$, we can develop a minimal cover set.
|
||||||
|
Using this, we can decompose $R$ into a set of relations such that all attributes are preserved, all functional
|
||||||
|
dependencies are preserved, the decomposition has the lossless join property, and all relations are in 3NF.
|
||||||
|
The advantages of this are that it provides a good database design and can be automated.
|
||||||
|
The primary disadvantage is that often, numerous good designs are possible.
|
||||||
|
|
||||||
|
\section{B Trees \& B+ Trees}
|
||||||
|
\subsection{Generalised Search Tree}
|
||||||
|
In a \textbf{Generalised Search Tree}, each node has the format $P_1, K_1, P_2, K_2, \dots, P_{n-1}, K_{n-1}, P_n$
|
||||||
|
where $P_i$ is a \textbf{tree value} and $K_i$ is a \textbf{search value}.
|
||||||
|
Hence, the number of values per node depends on the size of the key field, block size, \& block pointer size.
|
||||||
|
The following constraints hold:
|
||||||
|
\begin{itemize}
|
||||||
|
\item $K_1 < K_2 < \dots < K_{n-1} < K_n$.
|
||||||
|
\item For all values $x$ in a sub-tree pointed to by $P_i$, $K_{i-1} < x < K_i$.
|
||||||
|
\item The number of tree pointers per node is known as the \textbf{order} or $\rho$ of the tree.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{Efficiency}
|
||||||
|
For a generalised search tree: $T(N) = O(\text{log}(N))$, assuming a balanced tree.
|
||||||
|
In order to guarantee this efficiency in searching \& other operations, we need techniques to ensure that the tree is
|
||||||
|
always balanced.
|
||||||
|
|
||||||
|
\subsection{B Trees}
|
||||||
|
A \textbf{B tree} is a balanced generalised search tree.
|
||||||
|
B trees can be viewed as a dynamic multi-level index.
|
||||||
|
The properties of a search tree still hold, and the algorithms for insertion \& deletion of values are modified in order to keep the tree balanced.
|
||||||
|
The node structure contains a record pointer for each key value.
|
||||||
|
The node structure is as follows:
|
||||||
|
$$
|
||||||
|
P_1 < K_1, Pr_1 > P_2 < K_2, Pr_2 > \dots P_{n-1} < K_{n-1}, Pr_{n-1} > P_n
|
||||||
|
$$
|
||||||
|
|
||||||
|
\subsubsection{Example}
|
||||||
|
Consider a B Tree of order 3 (two values and three tree pointers per node/block).
|
||||||
|
Insert records with key values: 10, 6, 8, 14, 4, 16, 19, 11, 21.
|
||||||
|
|
||||||
|
\subsubsection{Algorithm to Insert a Value into a B Tree}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Find the appropriate leaf-level node to insert a value.
|
||||||
|
\item If space remains in the leaf-level node, then insert the new value in the correct location.
|
||||||
|
\item If no space remains, we need to deal with collisions.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsubsection{Dealing with Collisions}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Split node into left \& right nodes.
|
||||||
|
\item Propagate the middle value up a level and place its value in a node there. Note that this propagation may
|
||||||
|
cause further propagations and even the creation of a new root node.
|
||||||
|
\item Place the values less than the middle value in the left node.
|
||||||
|
\item Place the values greater than the middle value in the right node.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
This maintains the balanced nature of the tree, and $O(\text{log}_\rho(N))$ for search, insertion, \& deletion.
|
||||||
|
However, there is always potential for unused space in the tree.
|
||||||
|
Empirical analysis has shown that B trees remain 69\% full given random insertions \& deletions.
|
||||||
|
|
||||||
|
\subsubsection{Exercise}
|
||||||
|
Can you define an algorithm for deletion (at a high level)?
|
||||||
|
How much work is needed in the various cases (best, average, worst)?
|
||||||
|
|
||||||
|
\subsection{B+ Trees}
|
||||||
|
The most commonly used index type is the \textbf{B+ tree} -- a dynamic, multi-level index.
|
||||||
|
B+ trees differ from B trees in terms of structure, and have slightly more complicated insertion \& deletion algorithms.
|
||||||
|
B+ trees offer increased efficiency over a B tree and ensure a higher order $\rho$.
|
||||||
|
|
||||||
|
\subsubsection{Node Structure}
|
||||||
|
B+ trees have two different node structures: internal nodes \& leaf-level nodes.
|
||||||
|
The internal node structure is:
|
||||||
|
$$P_1, K_1, P_2, K_2, \dots P_{n-1}, K_{n-1}, P_n$$
|
||||||
|
All record pointers are maintained at the leaf level in a B+ tree.
|
||||||
|
There are no record pointers in the internal nodes.
|
||||||
|
B+ trees have less information per record and hence more search values per node.
|
||||||
|
\\\\
|
||||||
|
One tree pointer is maintained at each leaf-level node, which points to the next leaf-level node.
|
||||||
|
Note that there is only one tree pointer per node at the leaf level.
|
||||||
|
Each leaf-level node's structure is:
|
||||||
|
$$K_1, Pr_1, K_2, Pr_2, \dots K_m, Pr_m, P_{\text{next}}$$
|
||||||
|
|
||||||
|
\subsubsection{Example}
|
||||||
|
Let $B = 512$, $P = 6$, $K = 10$.
|
||||||
|
Assume 30,000 records as before.
|
||||||
|
Assume that the tree is 69\% full.
|
||||||
|
How many blocks will the tree require?
|
||||||
|
How many block accesses will a search require?
|
||||||
|
|
||||||
|
\subsubsection{Example}
|
||||||
|
A tree of order $\rho$ has at most $\rho - 1$ search values per node.
|
||||||
|
For a B+ tree, there are two types of tree nodes; hence there are two different orders: $\rho$ \& $\rho_{\text{leaf}}$.
|
||||||
|
To calculate $\rho_{\text{leaf}}$:
|
||||||
|
$$ |P| + (\rho_{\text{leaf}})(|K| + |Pr|) \leq B $$
|
||||||
|
$$ \rightarrow 17(\rho_{\text{leaf}}) \leq 506 $$
|
||||||
|
$$ \rho_{\text{leaf}} = 29 $$
|
||||||
|
|
||||||
|
Given a fill factor of 69\%:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Each internal node will have, on average, 22 pointers.
|
||||||
|
\item Each leaf-level node will have, on average, 20 pointers.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item Root: 1 node, 21 entries, 22 pointers.
|
||||||
|
\item Level 1: 22 nodes, 462 entries, 484 pointers.
|
||||||
|
\item Level 2: 484 nodes, \dots, etc.
|
||||||
|
\item Leaf Level: \dots
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Hence, 4 levels are sufficient.
|
||||||
|
The number of block accesses $= 4 + 1$.
|
||||||
|
The number of blocks is $1 + 22 + 484 + \dots$
|
||||||
|
|
||||||
|
\section{Hash Tables}
|
||||||
|
\subsection{Introduction}
|
||||||
|
Can we improve upon logarithmic searching?
|
||||||
|
\textbf{Hashing} is a technique that attempts to provide constant time for searching \& insertion, i.e. $O(1)$.
|
||||||
|
The basic idea for searching \& insertion is to apply a hash function to the search field of the record;
|
||||||
|
the return value of the hash function is used to reference a location in the hash table.
|
||||||
|
\\\\
|
||||||
|
Care should be taken in designing a hash function.
|
||||||
|
We usually require a \textbf{fair} hash function.
|
||||||
|
This is difficult to guarantee if there is no or limited information available about the type of data to be
|
||||||
|
stored.
|
||||||
|
Often, heuristics can be used if domain knowledge is available.
|
||||||
|
We can have either internal (i.e., some data structure in memory) or external hashing (i.e., to file locations).
|
||||||
|
We must consider the size of the original table or file.
|
||||||
|
|
||||||
|
\subsection{Approaches}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Create a hash table containing $N$ addressable ``slots'' which each can contain one record.
|
||||||
|
\item Create a hash function that returns a value to be used in insertion \& searching.
|
||||||
|
The value returned by the hash function must be in the correct range, i.e. the address space of the
|
||||||
|
hash table
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
If the range of the keys is that of the address space of the table, we can guarantee constant-time lookup.
|
||||||
|
However, this is usually not the case as the address space of the table is much smaller than that of the search field.
|
||||||
|
\\\\
|
||||||
|
With numeric keys, we can use modulo-division or truncation for hashing.
|
||||||
|
With character keys, we must first convert to an integer value: this can be achieved by multiplying the ASCII
|
||||||
|
code of the characters together and then applying modulo-division.
|
||||||
|
However, we cannot guarantee constant-time performance as collisions will occur, i.e. two records with different search
|
||||||
|
values being hashed to the same location in the table; we require a collision resolution policy.
|
||||||
|
Efficiency then will depend on the number of collisions.
|
||||||
|
The number of collisions depends primarily on the load factor $\lambda$ of the file:
|
||||||
|
$$
|
||||||
|
\lambda = \frac{\text{Number of records}}{\text{Number of slots}}
|
||||||
|
$$
|
||||||
|
|
||||||
|
\subsubsection{Collision Resolution Policies}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Chaining:} if a location is full, add the item to a linked list.
|
||||||
|
Performance degrades if the load factor is high.
|
||||||
|
The lookup time is on average $1 + \lambda$.
|
||||||
|
\item \textbf{Linear probing:} if a location is full, then check in a linear manner for the next free space.
|
||||||
|
This can degrade to a linear scan.
|
||||||
|
The performance, if successful is $0.5(1 + \frac{1}{1-\lambda})$ and if unsuccessful is
|
||||||
|
$0.5\left(1 + \left(\frac{1}{1 - \lambda}\right)^2\right)$.
|
||||||
|
One big disadvantage of this approach is that it leads to the formation of clusters.
|
||||||
|
\item \textbf{Quadratic probing:} if a location is full, check the location $x + 1$, location $x + 4$,
|
||||||
|
location $x + n^2$.
|
||||||
|
This results in less clustering.
|
||||||
|
\item \textbf{Double hashing:} if location $x$ is occupied, then apply a second hash function.
|
||||||
|
This can help guarantee an even distribution (a fairer hash function).
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Dynamic Hashing}
|
||||||
|
The cases that we've considered thus far deal with the idea of a \textbf{fixed hash table}: this is referred to
|
||||||
|
as \textbf{static hashing}.
|
||||||
|
Problems arise if the database grows larger than planned: too many overflow buckets and performance degrades.
|
||||||
|
A more suitable approach is \textbf{dynamic hashing}, wherein the table or file can be re-sized as needed.
|
||||||
|
|
||||||
|
\subsubsection{General Approach}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Use a family of hash functions $h_0$, $h_1$, $h_2$, etc.
|
||||||
|
$h_{i+1}$ is a refinement of $h_i$.
|
||||||
|
E.g., $K \text{mod} 2^i$.
|
||||||
|
\item Develop a base hash function that maps the key to a positive integer.
|
||||||
|
\item Use $h_0(x) = x \text{mod} 2^b$ for a chosen $b$.
|
||||||
|
There will be $2^b$ buckets initially.
|
||||||
|
We can effectively double the size of the table by incrementing $b$.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
We only double the number of buckets when re-organising conceptually; we do not actually double the number of
|
||||||
|
buckets in practice as it may not be needed.
|
||||||
|
|
||||||
|
\subsection{Dynamic Hashing Approaches}
|
||||||
|
Common dynamic hashing approaches include extendible hashing \& linear hashing.
|
||||||
|
\textbf{Extendible hashing} involves re-organising the buckets when \& where needed, whereas
|
||||||
|
\textbf{linear hashing} involves re-organising buckets when but not where needed.
|
||||||
|
|
||||||
|
\subsubsection{Extendible Hashing}
|
||||||
|
\begin{itemize}
|
||||||
|
\item When a bucket overflows, split that bucket in two. A directory is used to achieve this conceptual
|
||||||
|
doubling.
|
||||||
|
\item If a collision or overflow occurs, we don't re-organise the file by doubling the number of buckets,
|
||||||
|
as this would be too expensive.
|
||||||
|
Instead, we maintain a directory of pointers to buckets, we can effectively double the number of
|
||||||
|
buckets by doubling the directory, splitting just the bucket that overflowed.
|
||||||
|
Doubling the directory is much cheaper than doubling the file, as the directory is much smaller than
|
||||||
|
the file.
|
||||||
|
\item On overflow, we split the bucket by allocating a new bucket and redistributing its contents.
|
||||||
|
We double the directory size if necessary.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
We maintain a \textbf{local depth} for each bucket, effectively the number of bits needed to hash an item here.
|
||||||
|
We also maintain a \textbf{global depth} for the directory which is the number of bits used in indexing items.
|
||||||
|
We can use these values to determine when to split the directory.
|
||||||
|
\begin{itemize}
|
||||||
|
\item If overflow occurs in a bucket where the local depth = global depth, then split the bucket,
|
||||||
|
redistribute its contents, and double the directory.
|
||||||
|
\item If overflow occurs in a bucket where the local depth < global depth, then split the bucket,
|
||||||
|
redistribute its contents, and increase the local depth.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
If the directory can fit in the memory, then the retrieval for point queries can be achieved with one disk read.
|
||||||
|
|
||||||
|
\subsection{Linear Hashing}
|
||||||
|
\textbf{Linear hashing} is another approach to indexing to a dynamic file.
|
||||||
|
It is similar to dynamic hashing in that a family of hash functions are used ($h = K \text{mod} 2^i$), but
|
||||||
|
differs in that no index is needed.
|
||||||
|
Initially, we create a file of $M$ buckets; $K \bmod M$ is a suitable hash function.
|
||||||
|
We will use a family of such functions $K \text{mod} (2^i \times M), i = 0$ initially.
|
||||||
|
We can view the hashing as comprising of a sequence of phases: for phase $j$, the hash functions
|
||||||
|
$K \bmod (2^j \times M)$ \& $K \bmod (2^{j+1} \times M)$ are used.
|
||||||
|
\\\\
|
||||||
|
We \textbf{split a bucket} by redistributing the records into two buckets: the original one \& a new one.
|
||||||
|
In phase $j$, to determine which ones go into the original while the others go into the new one, we use
|
||||||
|
$H_{j+1}(K) = K \text{mod} 2^{j+1} \times M$ to calculate their address.
|
||||||
|
Irrespective of the bucket which causes the overflow, we always split the next bucket in a \textbf{linear order}.
|
||||||
|
We begin with bucket 0, and keep track of which bucket to split next $p$.
|
||||||
|
At the end of a phase when $p$ is equal to the number of buckets present at the start of the phrase, we reset $p$
|
||||||
|
and a new phase begins ($j$ is incremented).
|
||||||
|
|
||||||
|
\section{Joins}
|
||||||
|
Many approaches \& algorithms can be used to do \textbf{joins}.
|
||||||
|
|
||||||
|
\subsection{Nested Loop Join}
|
||||||
|
To perform the join $r \bowtie s$:
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{text}
|
||||||
|
for each tuple t_r in r do:
|
||||||
|
for each tuple t_s in s do:
|
||||||
|
if t_r and t_s satisfy join condition:
|
||||||
|
add(t_r, t_s) to result
|
||||||
|
end
|
||||||
|
end
|
||||||
|
\end{minted}
|
||||||
|
|
||||||
|
This is an expensive approach; every pair of tuples is checked to see if they satisfy the join condition.
|
||||||
|
If one of the relations fits in memory, it is beneficial to use this in the inner loop (known as the
|
||||||
|
\textbf{inner relation}).
|
||||||
|
|
||||||
|
\subsection{Block nested Loop Join}
|
||||||
|
The \textbf{block nested loop join} is a variation on the nested loop join that increases efficiency by
|
||||||
|
reducing the number of block accesses.
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{text}
|
||||||
|
for each block B_r in r do:
|
||||||
|
for each block B_s in s do:
|
||||||
|
for each tuple t_r in B_r do:
|
||||||
|
for each tuple t_s in B_s do:
|
||||||
|
if t_r and t_s satisfy join condition:
|
||||||
|
add (t_r, t_s) to result
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
\end{minted}
|
||||||
|
|
||||||
|
\subsection{Indexed Nested Loop Join}
|
||||||
|
If there is an index available for the inner table in a nested loop join, we can replace file scans with index
|
||||||
|
accesses.
|
||||||
|
|
||||||
|
\subsection{Merge Join}
|
||||||
|
If both relations are sorted on the joining attribute, then we can merge the relations.
|
||||||
|
The technique is identical to merging two sorted lists (such as in the ``merge'' step of a Merge-Sort algorithm).
|
||||||
|
Merge joins are much more efficient than a nested join.
|
||||||
|
They can also be computed for relations that are not ordered on a joining attribute, but have indexes on the joining
|
||||||
|
attribute.
|
||||||
|
|
||||||
|
\subsection{Hash Joins}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Create a hashing function which maps the join attribute(s) to partitions in a range $1 .. N$.
|
||||||
|
\item For all tuples in $r$, hash the tuples to $H_{ri}$.
|
||||||
|
\item For all tuples in $s$, hash the tuples to $H_{si}$.
|
||||||
|
\item For $i = 1$ to $N$, join the partitions $H_{ri}$ \& $H_{si}$.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\section{Sorting}
|
||||||
|
Sorting is a very important operation because it is used if a query specifies \mintinline{SQL}{ORDER BY} and is
|
||||||
|
used prior to relational operators (e.g. Join) to allow more efficient processing of the operation.
|
||||||
|
We can sort a relation in two ways:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Physically:} actual order of tuples re-arranged on the disk.
|
||||||
|
\item \textbf{Logically:} build an index and sort index entries.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
When the relation to be sorted fits in the memory, we can use standard sorting techniques (such as Quicksort).
|
||||||
|
However, when the relation doesn't fit in memory, we have to use other approaches such as the \textbf{external
|
||||||
|
sort merge}, which is essentially an $N$-way merge, an extension of the idea in the merge step of the merge sort
|
||||||
|
algorithm.
|
||||||
|
|
||||||
|
\begin{minted}[texcl, mathescape, linenos, breaklines, frame=single]{text}
|
||||||
|
i := 0;
|
||||||
|
M = number of page frames in main memory buffer
|
||||||
|
|
||||||
|
repeat
|
||||||
|
read M blocks of the relation
|
||||||
|
sort M blocks in memory
|
||||||
|
write sorted data to file R_i
|
||||||
|
until end of relation
|
||||||
|
|
||||||
|
read first block of each R_i into memory
|
||||||
|
repeat
|
||||||
|
choose first (in sort order) from pages
|
||||||
|
write tuple to output
|
||||||
|
remove tuple from buffer
|
||||||
|
if any buffer R_i empty and not eof(R_i)
|
||||||
|
read next block from R_i into memory
|
||||||
|
until all pages empty
|
||||||
|
\end{minted}
|
||||||
|
|
||||||
|
\section{Parallel Databases}
|
||||||
|
Characteristics of \textbf{Parallel databases} include:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Increased transaction requirements.
|
||||||
|
\item Increased volumes of data, particularly in data-warehousing.
|
||||||
|
\item Many queries lend themselves easily to parallel execution.
|
||||||
|
\item Can reduce the time required to retrieve relations from disk by partitioning relations onto a set of
|
||||||
|
disks.
|
||||||
|
\item Horizontal partitioning is usually used.
|
||||||
|
Subsets of a relation are sent to different disks.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Query Types}
|
||||||
|
Common types of queries include:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Batch processing:} scanning an entire relation.
|
||||||
|
\item \textbf{Point-Queries:} return all tuples that match some value.
|
||||||
|
\item \textbf{Range-Queries:} return all tuples with some value in some range.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Partitioning Approaches}
|
||||||
|
\subsubsection{Round Robin}
|
||||||
|
With \textbf{Round Robin}, the relation is scanned in order.
|
||||||
|
Assuming $n$ disks, the $i^\text{th}$ tuple is sent to disk $D_{i \bmod n}$.
|
||||||
|
Round Robin guarantees an even distribution.
|
||||||
|
\\\\
|
||||||
|
Round Robin is useful for batch processing, but is not very suitable for either point or range queries as all
|
||||||
|
disks have to be accessed.
|
||||||
|
|
||||||
|
\subsubsection{Hash Partitioning}
|
||||||
|
In \textbf{hash partitioning}, we choose attributes to act as partitioning attributes.
|
||||||
|
We define a hash function with range $0 \dots n-1$, assuming $n$ disks.
|
||||||
|
Each tuple is placed according to the result of the hash function.
|
||||||
|
\\\\
|
||||||
|
Hash partitioning is very useful if a point query is based on a partitioning attribute.
|
||||||
|
It is usually useful for batch querying if a fair hash function is used, but is poor for range querying.
|
||||||
|
|
||||||
|
\subsubsection{Range Partitioning}
|
||||||
|
In \textbf{range partitioning}, a partitioning attribute is first chosen.
|
||||||
|
The partitioning vector is defined as $< v_0, v_1, \dots, v_{n-2} >$.
|
||||||
|
Tuples are placed according to the value of the partitioning attribute.
|
||||||
|
If $t_\text{partitioning attribute} < v_0$, then we place tuple $t$ on disk $D_0$.
|
||||||
|
\\\\
|
||||||
|
Range partitioning is useful for both point \& range querying, but can lead to inefficiency in range querying
|
||||||
|
if many tuples satisfy the condition.
|
||||||
|
|
||||||
|
\subsection{Types of Parallelism}
|
||||||
|
\subsubsection{Inter-Query Parallelism}
|
||||||
|
In \textbf{inter-query parallelism}, different transactions run in parallel on different processors, thus
|
||||||
|
increasing the transaction throughput, although the times for individual queries remain the same.
|
||||||
|
Inter-query parallelism is the easiest form of parallelism to implement.
|
||||||
|
|
||||||
|
\subsubsection{Intra-Query Parallelism}
|
||||||
|
\textbf{Intra-query parallelism} allows us to run a single query in parallel on multiple processors (\& disks),
|
||||||
|
which can speed up the running time of a query.
|
||||||
|
Parallel execution can be achieved by parallelising individual components, which is called
|
||||||
|
\textbf{intra-operation parallelism}.
|
||||||
|
Parallel execution can also be achieved by evaluating portions of the query in parallel, which is called
|
||||||
|
\textbf{inter-operation parallelism}.
|
||||||
|
Both of these types can be combined.
|
||||||
|
|
||||||
|
\end{document}
|
@ -0,0 +1,19 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
i := 0;
|
||||||
|
M = number of page frames in main memory buffer
|
||||||
|
|
||||||
|
repeat
|
||||||
|
read M blocks of the relation
|
||||||
|
sort M blocks in memory
|
||||||
|
write sorted data to file R\PYGZus{}i
|
||||||
|
until end of relation
|
||||||
|
|
||||||
|
read first block of each R\PYGZus{}i into memory
|
||||||
|
repeat
|
||||||
|
choose first (in sort order) from pages
|
||||||
|
write tuple to output
|
||||||
|
remove tuple from buffer
|
||||||
|
if any buffer R\PYGZus{}i empty and not eof(R\PYGZus{}i)
|
||||||
|
read next block from R\PYGZus{}i into memory
|
||||||
|
until all pages empty
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,12 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
for each block B\PYGZus{}r in r do:
|
||||||
|
for each block B\PYGZus{}s in s do:
|
||||||
|
for each tuple t\PYGZus{}r in B\PYGZus{}r do:
|
||||||
|
for each tuple t\PYGZus{}s in B\PYGZus{}s do:
|
||||||
|
if t\PYGZus{}r and t\PYGZus{}s satisfy join condition:
|
||||||
|
add (t\PYGZus{}r, t\PYGZus{}s) to result
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,8 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\},codes={\catcode`\$=3\catcode`\^=7\catcode`\_=8\relax}]
|
||||||
|
for each tuple t\PYGZus{}r in r do:
|
||||||
|
for each tuple t\PYGZus{}s in s do:
|
||||||
|
if t\PYGZus{}r and t\PYGZus{}s satisfy join condition:
|
||||||
|
add(t\PYGZus{}r, t\PYGZus{}s) to result
|
||||||
|
end
|
||||||
|
end
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,3 @@
|
|||||||
|
\begin{Verbatim}[commandchars=\\\{\}]
|
||||||
|
\PYG{k}{ORDER}\PYG{+w}{ }\PYG{k}{BY}
|
||||||
|
\end{Verbatim}
|
@ -0,0 +1,76 @@
|
|||||||
|
|
||||||
|
\makeatletter
|
||||||
|
\def\PYG@reset{\let\PYG@it=\relax \let\PYG@bf=\relax%
|
||||||
|
\let\PYG@ul=\relax \let\PYG@tc=\relax%
|
||||||
|
\let\PYG@bc=\relax \let\PYG@ff=\relax}
|
||||||
|
\def\PYG@tok#1{\csname PYG@tok@#1\endcsname}
|
||||||
|
\def\PYG@toks#1+{\ifx\relax#1\empty\else%
|
||||||
|
\PYG@tok{#1}\expandafter\PYG@toks\fi}
|
||||||
|
\def\PYG@do#1{\PYG@bc{\PYG@tc{\PYG@ul{%
|
||||||
|
\PYG@it{\PYG@bf{\PYG@ff{#1}}}}}}}
|
||||||
|
\def\PYG#1#2{\PYG@reset\PYG@toks#1+\relax+\PYG@do{#2}}
|
||||||
|
|
||||||
|
\@namedef{PYG@tok@c}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cp}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cs}{\let\PYG@bf=\textbf\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@k}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kd}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nb}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@bp}{\let\PYG@bf=\textbf\let\PYG@it=\textit}
|
||||||
|
\@namedef{PYG@tok@nn}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nf}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@nv}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@no}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ow}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@s}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@err}{\def\PYG@bc##1{{\setlength{\fboxsep}{\string -\fboxrule}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}}
|
||||||
|
\@namedef{PYG@tok@kc}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kn}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kp}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kr}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@kt}{\let\PYG@bf=\textbf}
|
||||||
|
\@namedef{PYG@tok@fm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vc}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vg}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vi}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@vm}{\let\PYG@bf=\textbf\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sa}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sb}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sc}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@dl}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sd}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s2}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@se}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sh}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@si}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sx}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@sr}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@s1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ss}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
|
||||||
|
\@namedef{PYG@tok@ch}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cm}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@cpf}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
\@namedef{PYG@tok@c1}{\let\PYG@it=\textit\def\PYG@tc##1{\textcolor[rgb]{0.53,0.53,0.53}{##1}}}
|
||||||
|
|
||||||
|
\def\PYGZbs{\char`\\}
|
||||||
|
\def\PYGZus{\char`\_}
|
||||||
|
\def\PYGZob{\char`\{}
|
||||||
|
\def\PYGZcb{\char`\}}
|
||||||
|
\def\PYGZca{\char`\^}
|
||||||
|
\def\PYGZam{\char`\&}
|
||||||
|
\def\PYGZlt{\char`\<}
|
||||||
|
\def\PYGZgt{\char`\>}
|
||||||
|
\def\PYGZsh{\char`\#}
|
||||||
|
\def\PYGZpc{\char`\%}
|
||||||
|
\def\PYGZdl{\char`\$}
|
||||||
|
\def\PYGZhy{\char`\-}
|
||||||
|
\def\PYGZsq{\char`\'}
|
||||||
|
\def\PYGZdq{\char`\"}
|
||||||
|
\def\PYGZti{\char`\~}
|
||||||
|
% for compatibility with earlier versions
|
||||||
|
\def\PYGZat{@}
|
||||||
|
\def\PYGZlb{[}
|
||||||
|
\def\PYGZrb{]}
|
||||||
|
\makeatother
|
||||||
|
|
BIN
third/semester1/CT3532: Database Systems II/notes/images/db.png
Normal file
BIN
third/semester1/CT3532: Database Systems II/notes/images/db.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 22 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
third/semester1/CT3532: Database Systems II/slides/2PL.pdf
Normal file
BIN
third/semester1/CT3532: Database Systems II/slides/2PL.pdf
Normal file
Binary file not shown.
BIN
third/semester1/CT3532: Database Systems II/slides/BTrees.pdf
Normal file
BIN
third/semester1/CT3532: Database Systems II/slides/BTrees.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
third/semester1/CT3532: Database Systems II/slides/Hashing1.pdf
Normal file
BIN
third/semester1/CT3532: Database Systems II/slides/Hashing1.pdf
Normal file
Binary file not shown.
BIN
third/semester1/CT3532: Database Systems II/slides/Join_Sort.pdf
Normal file
BIN
third/semester1/CT3532: Database Systems II/slides/Join_Sort.pdf
Normal file
Binary file not shown.
BIN
third/semester1/CT3532: Database Systems II/slides/Lecture1.pdf
Normal file
BIN
third/semester1/CT3532: Database Systems II/slides/Lecture1.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user