Add new scripts to collect Athena telemetry data

mitdbg · Nov 23, 2023 · b69cc32 · b69cc32
1 parent 31a1d62
commit b69cc32
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 0 deletions.
diff --git a/tools/one_off/extract_telemetry_queries.py b/tools/one_off/extract_telemetry_queries.py
@@ -0,0 +1,29 @@
+import json
+from typing import Dict, List
+
+
+def process_file(filename: str) -> Dict[str, List[str]]:
+    with open(filename, "r", encoding="UTF-8") as file:
+        raw = json.load(file)
+
+    results = {}
+    for epoch_key, inner in raw.items():
+        queries = []
+        for q in inner["athena_result"]:
+            queries.append(q["sql"])
+        results[epoch_key] = queries
+
+    return results
+
+
+def main():
+    data1 = process_file("telemetry_workload.json")
+    data2 = process_file("telemetry_workload_100g.json")
+
+    combined = {**data1, **data2}
+    with open("telemetry_queries.json", "w", encoding="UTF-8") as file:
+        json.dump(combined, file, indent=2)
+
+
+# Only run the extraction when this file is executed directly as a script.
+if __name__ == "__main__":
+    main()
diff --git a/tools/one_off/gather_athena_telemetry.sh b/tools/one_off/gather_athena_telemetry.sh
@@ -0,0 +1,44 @@
+#! /bin/bash
+
+SCRIPT_PATH=$(cd $(dirname $0) && pwd -P)
+cd $SCRIPT_PATH
+
+if [ -z $4 ]; then
+  echo "Usage: $0 queries_json out_dir config_file schema_name"
+fi
+
+queries_json=$1
+out_dir=$2
+config_file=$3
+schema_name=$4
+
+mkdir -p $out_dir/sql
+mkdir -p $out_dir/raw
+
+for epoch in "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22"; do
+    echo "Processing $epoch"
+    jq -r '.["epoch_'"$epoch"'"] | .[]' $queries_json > $out_dir/sql/epoch_${epoch}.sql
+
+    echo "Gathering data..."
+    python ../../run_cost_model.py \
+      --run_workload \
+      --run_workload_rank 0 \
+      --run_workload_world_size 1 \
+      --database athena \
+      --db_name imdb_specialized_100g \
+      --query_timeout 300 \
+      --s3_output_path "s3://geoffxy-research/athena/out" \
+      --source $out_dir/sql/epoch_${epoch}.sql \
+      --target $out_dir/raw/athena_epoch_${epoch}.json
+
+    # Expand the table.
+    echo "Expanding the table for the next epoch..."
+    python3 ../load_telemetry.py \
+        --config-file $config_file \
+        --engines athena \
+        --data-s3-bucket geoffxy-research \
+        --data-s3-path imdb_specialized_100g/telemetry/telemetry.csv \
+        --times 1 \
+        --schema-name $schema_name
+fi
+