diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..fc2895f
Binary files /dev/null and b/.DS_Store differ
diff --git a/Dockerfile b/Dockerfile
index e71ecc4..3b43b8b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,7 @@ ADD scripts/enable_api.sh /opt/scripts/enable_api.sh
 ADD scripts/estimate_billing.py /opt/scripts/estimate_billing.py
 ADD scripts/persist_artifacts.py /opt/scripts/persist_artifacts.py
 ADD scripts/costs_json_to_csv.py /opt/scripts/costs_json_to_csv.py
+ADD scripts/cost_script.py /opt/scripts/cost_script.py
 
 # GMS setup/run
 ADD gms/resources.sh /opt/gms/resources.sh
diff --git a/scripts/README.md b/scripts/README.md
index 88b793f..746746d 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -110,6 +110,14 @@ This functionality is also wrapped into estimate\_billing.py under the
 I'd still run these separately just to have both, but if you're only after the CSV
 this may be more convenient.
 
+# cost\_script.py
+
+Takes the output of costs_json_to_csv.py and collapses tasks that have been split into shards, giving one cost for the entire task.
+It outputs a csv labeled costs_report_final.csv.
+
+Use as follows-
+
+    python3 /opt/scripts/cost_script.py costs.tsv
 
 # Troubleshooting scripts
 
diff --git a/scripts/cost_script.py b/scripts/cost_script.py
new file mode 100644
index 0000000..ea7087b
--- /dev/null
+++ b/scripts/cost_script.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+"""
+Collapse sharded task costs into one summary row per task.
+
+Reads the costs TSV produced by costs_json_to_csv.py, removes the "_shard-N"
+suffix from each call name so every shard of a task shares one name, sums the
+cost columns per call name, and writes the totals, sorted by descending total
+cost, to costs_report_final.csv in the current working directory.
+
+Usage: cost_script.py [costs tsv file]
+"""
+import sys
+
+import pandas as pd
+import regex as re
+
+COST_COLUMNS = ["totalCost", "cpuCost", "memoryCost", "diskCost"]
+OUTPUT_FILE = "costs_report_final.csv"
+# Raw string so "\d" reaches the regex engine instead of being a str escape.
+SHARD_SUFFIX = re.compile(r"_shard-\d+")
+
+
+def summarize_costs(costs):
+    """Return per-task cost totals, most expensive task first.
+
+    costs must hold a callName column and every column in COST_COLUMNS; cost
+    values may still be strings. The result is indexed by call name with the
+    shard suffix removed.
+    """
+    summary = costs[["callName"] + COST_COLUMNS].copy()
+    # Names with and without a retry marker get the same substitution.
+    summary["callName"] = summary["callName"].map(
+        lambda name: SHARD_SUFFIX.sub("", name)
+    )
+    summary = summary.astype(dict.fromkeys(COST_COLUMNS, float))
+    totals = summary.groupby("callName").sum()
+    return totals.sort_values(by="totalCost", ascending=False)
+
+
+def main(argv):
+    """Summarize the TSV named in argv[1] into OUTPUT_FILE."""
+    if len(argv) != 2:
+        sys.exit("Usage: cost_script.py [costs tsv file]")
+    # read_csv strips line endings, so the last column name and its values no
+    # longer carry a trailing newline as they did with a manual split.
+    costs = pd.read_csv(argv[1], sep="\t", dtype=str, keep_default_na=False)
+    summarize_costs(costs).to_csv(OUTPUT_FILE, index=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index 76edcda..69eaf48 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -1,3 +1,6 @@
+numpy
+pandas
+regex
 cwl_utils
 miniwdl == 1.2.1