Skip to content

Commit

Permalink
Add new scripts to collect Athena telemetry data
Browse files Browse the repository at this point in the history
  • Loading branch information
geoffxy committed Nov 23, 2023
1 parent 31a1d62 commit b69cc32
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
29 changes: 29 additions & 0 deletions tools/one_off/extract_telemetry_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import json
from typing import Dict, List


def process_file(filename: str) -> Dict[str, List[str]]:
    """Extract the SQL text of every Athena query, grouped by top-level key.

    The file is parsed as JSON and must contain an object whose values each
    hold an ``"athena_result"`` list; every element of that list must have a
    ``"sql"`` entry.

    Args:
        filename: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A dict mapping each top-level key of the file to the list of
        ``"sql"`` values found under that key's ``"athena_result"``, in the
        order they appear in the file.

    Raises:
        KeyError: If an entry has no ``"athena_result"`` or a query has no
            ``"sql"`` field.
    """
    with open(filename, "r", encoding="UTF-8") as file:
        raw = json.load(file)

    return {
        epoch_key: [q["sql"] for q in inner["athena_result"]]
        for epoch_key, inner in raw.items()
    }


def main():
    """Merge the per-key query lists of both telemetry workloads into one file.

    Reads ``telemetry_workload.json`` and ``telemetry_workload_100g.json``
    from the current working directory and writes the merged mapping to
    ``telemetry_queries.json`` as indented JSON.
    """
    data1 = process_file("telemetry_workload.json")
    data2 = process_file("telemetry_workload_100g.json")

    # If a key is present in both inputs, the entry from the second file wins.
    combined = {**data1, **data2}
    with open("telemetry_queries.json", "w", encoding="UTF-8") as file:
        json.dump(combined, file, indent=2)


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
44 changes: 44 additions & 0 deletions tools/one_off/gather_athena_telemetry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#! /bin/bash

# Gathers Athena measurements for the telemetry queries, one epoch at a time.
#
# For each epoch 0..22 this script:
#   1. extracts the entries stored under "epoch_<n>" in the queries JSON file
#      (via jq) into <out_dir>/sql/epoch_<n>.sql,
#   2. runs them through ../../run_cost_model.py against Athena, writing to
#      <out_dir>/raw/athena_epoch_<n>.json, and
#   3. invokes ../load_telemetry.py once to expand the table before the next
#      epoch.
#
# Usage: gather_athena_telemetry.sh queries_json out_dir config_file schema_name
#
# NOTE: the script changes into its own directory first, so relative paths
# passed as arguments are resolved against the script's directory, not the
# caller's working directory.

SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd -P)
cd "$SCRIPT_PATH" || exit 1

# All four positional arguments are required; stop here instead of running
# the loop below with empty paths.
if [ -z "$4" ]; then
  echo "Usage: $0 queries_json out_dir config_file schema_name"
  exit 1
fi

queries_json=$1
out_dir=$2
config_file=$3
schema_name=$4

mkdir -p "$out_dir/sql"
mkdir -p "$out_dir/raw"

# Brace expansion yields one word per epoch (0 through 22), so the loop body
# runs once per epoch.
for epoch in {0..22}; do
  echo "Processing $epoch"
  jq -r '.["epoch_'"$epoch"'"] | .[]' "$queries_json" > "$out_dir/sql/epoch_${epoch}.sql"

  echo "Gathering data..."
  python ../../run_cost_model.py \
    --run_workload \
    --run_workload_rank 0 \
    --run_workload_world_size 1 \
    --database athena \
    --db_name imdb_specialized_100g \
    --query_timeout 300 \
    --s3_output_path "s3://geoffxy-research/athena/out" \
    --source "$out_dir/sql/epoch_${epoch}.sql" \
    --target "$out_dir/raw/athena_epoch_${epoch}.json"

  # Expand the table.
  echo "Expanding the table for the next epoch..."
  python3 ../load_telemetry.py \
    --config-file "$config_file" \
    --engines athena \
    --data-s3-bucket geoffxy-research \
    --data-s3-path imdb_specialized_100g/telemetry/telemetry.csv \
    --times 1 \
    --schema-name "$schema_name"
done

0 comments on commit b69cc32

Please sign in to comment.