[server]: Optimise permanent_data.py

2025-03-09 14:00:43 +00:00
parent 14607d0eac
commit d28cf0c036


@@ -6,23 +6,31 @@ import zipfile
import io
import os
import boto3
from concurrent.futures import ThreadPoolExecutor
# Create a reusable session for requests
session = requests.Session()
# Setup DynamoDB client for Lambda
dynamodb = boto3.resource("dynamodb")
table_name = os.environ.get("DYNAMODB_TABLE", "permanent_data")
table = dynamodb.Table(table_name)
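# created at module load so that warm Lambda invocations reuse the same client and table handle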
# API URLs
irishrail_url = "http://api.irishrail.ie/realtime/realtime.asmx/"
# function to fetch Irish Rail station data with types
# this function seems to be missing stations -- the API must have some uncategorised stations that it won't return
# unfortunately, this is the only way to categorise stations as the API won't return the station's category
def fetch_train_stations_with_type():
    """
    Fetch train stations from the Irish Rail API with specific station types.

    Returns:
        list: A list of dictionaries containing train station data with types.
    """
    station_types = ["M", "S", "D"]
    stations = []

    for station_type in station_types:
        response = session.get(irishrail_url + f"getAllStationsXML_WithStationType?StationType={station_type}")
        stations_xml = response.text
        stations_json = xmltodict.parse(stations_xml)

        for station in stations_json["ArrayOfObjStation"]["objStation"]:
            stations.append({
@@ -30,56 +38,51 @@ def fetch_train_stations_with_type():
"objectType": "IrishRailStation",
"latitude": station["StationLatitude"],
"longitude": station["StationLongitude"],
"trainStationID": station["StationId"],
"trainStationCode": station["StationCode"],
"trainStationAlias": station["StationAlias"],
"trainStationAlias": station.get("StationAlias", ""),
"trainStationDesc": station["StationDesc"],
"trainStationType": station_type
})
return stations

# function to fetch Irish Rail station data without types
def fetch_train_stations():
    """
    Fetch all train stations from the Irish Rail API.

    Returns:
        list: A list of dictionaries containing train station data.
    """
    response = session.get(irishrail_url + "getAllStationsXML")
    stations_xml = response.text
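    # xmltodict parses the XML response straight into nested dicts, so no JSON round-trip is needed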
    stations_json = xmltodict.parse(stations_xml)

    stations = [{
        "objectID": "IrishRailStation-" + station["StationCode"],
        "objectType": "IrishRailStation",
        "latitude": station["StationLatitude"],
        "longitude": station["StationLongitude"],
        "trainStationID": station["StationId"],
        "trainStationCode": station["StationCode"],
        "trainStationAlias": station.get("StationAlias", ""),
        "trainStationDesc": station["StationDesc"]
    } for station in stations_json["ArrayOfObjStation"]["objStation"]]

    return stations

# function to fetch Luas stops data
def fetch_luas():
    """
    Fetch Luas stops from the TII dataset.

    Returns:
        list: A list of dictionaries containing Luas stop data.
    """
    response = session.get("https://data.tii.ie/Datasets/Luas/StopLocations/luas-stops.txt")
    stops_tsv = response.content.decode('utf-8-sig')
    tsv_reader = csv.DictReader(stops_tsv.splitlines(), delimiter="\t")

    stops = [{
        "objectID": "LuasStop-" + stop["Abbreviation"],
        "objectType": "LuasStop",
        "latitude": stop["Latitude"],
        "longitude": stop["Longitude"],
        "luasStopName": stop["Name"],
        "luasStopIrishName": stop["IrishName"],
        "luasStopID": stop["StopID"],
@@ -90,109 +93,119 @@ def fetch_luas():
"luasStopIsParkAndRide": stop["IsParkAndRide"],
"luasStopIsCycleAndRide": stop["IsCycleAndRide"],
"luasStopZoneCountA": stop["ZoneCountA"],
"luasStopZoneCountB": stop["ZoneCountB"],
})
"luasStopZoneCountB": stop["ZoneCountB"]
} for stop in tsv_reader]
return stops

def fetch_gtfs():
    """
    Fetch GTFS data from the Transport for Ireland dataset.

    Returns:
        list: A list of dictionaries containing GTFS data.
    """
    url = "https://www.transportforireland.ie/transitData/Data/GTFS_All.zip"
    zip_file = session.get(url).content
    data = []

    with zipfile.ZipFile(io.BytesIO(zip_file)) as zip:
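        # the GTFS zip bundles several standard text files; only agency.txt, routes.txt and stops.txt are used here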
        # will need to access the list of agencies for later objects, so keeping separate
        # extract agencies data
        if "agency.txt" in zip.namelist():
            with zip.open("agency.txt") as file:
                agencies_csv = file.read().decode('utf-8')
                agencies = [{
                    "objectID": "BusAgency" + agency["agency_id"],
                    "objectType": "BusAgency",
                    # no latitude or longitude
                    "busAgencyID": agency["agency_id"],
                    "busAgencyName": agency["agency_name"],
                    "busAgencyURL": agency["agency_url"]
                } for agency in csv.DictReader(agencies_csv.splitlines())]
                data.extend(agencies)

        # extract routes data
        if "routes.txt" in zip.namelist():
            with zip.open("routes.txt") as file:
                routes_csv = file.read().decode('utf-8')
                data.extend([{
                    "objectID": "BusRoute-" + route["route_id"],
                    "objectType": "BusRoute",
                    # no latitude or longitude
                    "busRouteID": route["route_id"],
                    "busRouteAgencyID": route["agency_id"],
                    "busRouteShortName": route["route_short_name"],
                    "busRouteLongName": route["route_long_name"],
"busRouteAgencyName": next((agency['busAgencyName'] for agency in data if agency['busAgencyID'] == route["agency_id"]), None)
} for route in csv.DictReader(routes_csv.splitlines())])

        # extract stops data
        if "stops.txt" in zip.namelist():
            with zip.open("stops.txt") as file:
                stops_csv = file.read().decode('utf-8')
                data.extend([{
                    "objectID": "BusStop-" + stop["stop_id"],
                    "objectType": "BusStop",
                    "latitude": stop["stop_lat"],
                    "longitude": stop["stop_lon"],
                    "busStopID": stop["stop_id"],
"busStopCode": stop.get("stop_code", ""),
"busStopName": stop["stop_name"]
})
} for stop in csv.DictReader(stops_csv.splitlines())])
return data

def batch_upload_to_dynamodb(data):
    """
    Batch upload data to DynamoDB.

    Args:
        data (list): A list of dictionaries containing data to be uploaded.
    """
    with table.batch_writer() as batch:
        for item in data:
            batch.put_item(Item=item)

def lambda_handler(event, context):
    """
    AWS Lambda handler to fetch data and upload it to DynamoDB.

    Args:
        event (dict): Event data passed to the Lambda function.
        context (object): Runtime information of the Lambda function.

    Returns:
        dict: A dictionary containing the status code and message.
    """
    print("Lambda Handler invoked! Retrieving data...")

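    # the three fetches are network-bound, so running them in threads lets the waits overlap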
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_train_stations),
            executor.submit(fetch_luas),
            executor.submit(fetch_gtfs)
        ]
        data = []
        for future in futures:
            data.extend(future.result())
print("Attempting to batch upload retrieved data")
try:
with table.batch_writer() as batch:
for record in data:
batch.put_item(Item=record)
print("done uploading")
print(f"Retrieved {len(data)} records.")
print("Uploading to DynamoDB...")
        chunk_size = 25
        for i in range(0, len(data), chunk_size):
            batch_upload_to_dynamodb(data[i:i + chunk_size])

        print("Upload completed.")
        return {
            'statusCode': 200,
            'body': json.dumps({'message': 'Data uploaded successfully!'})
        }
    except Exception as e:
        return {"statusCode": 500, "error": str(e)}

if __name__ == "__main__":
    # Fetch data locally and print it as JSON.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(fetch_train_stations),
            executor.submit(fetch_luas),
            executor.submit(fetch_gtfs)
        ]
        data = []
        for future in futures:
            data.extend(future.result())

    print(json.dumps(data))