From 007cd36be7254de79ab1b47b6ed39fd1718cc15f Mon Sep 17 00:00:00 2001 From: ksinghal28 Date: Thu, 13 Oct 2022 10:28:20 -0500 Subject: [PATCH 1/4] KS-added cost script and changed dockerfile and requirements file to account for dependencies --- .DS_Store | Bin 0 -> 6148 bytes Dockerfile | 1 + scripts/cost_script.py | 53 +++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 3 +++ 4 files changed, 57 insertions(+) create mode 100644 .DS_Store create mode 100644 scripts/cost_script.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fc2895fccb6bb05e86d0ff15dddedc24efd642d3 GIT binary patch literal 6148 zcmeHK%}T>S5T3QwrW7FuMUM+!3&x)o#7n641&ruHr8c%`FlI}V+CwSisxRc5_&m<+ zZoq26n~0r(-EVe&b~7Jje*i#q#)CZo7XTb=go2a>Lgt086&p+_^c*9|AqywjD2x|O z^cPL^?KQ~Y4pNBV)Ay?oJ#XPCj-5tr8l5W@ zho#;RFQVb3=Wd*+B=1MbFx3gsV2B~tmr*iMla3lEgG}cJ#^KbRde7aOPTQ@fYkO{%a@3}MpIE^VA|u`p=TLFkon9=me+coBMav`ZZh!Z*l0Gr$bYGf*)@m(KrF z{ACtC^5;`{#0)S4|BL}q?FQWr7G=-YZ{^WhE3w^TBcZs06cp6wE&({8ePmlZjbD Date: Thu, 13 Oct 2022 10:57:22 -0500 Subject: [PATCH 2/4] Updating README for cost_script.py addition --- scripts/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index 88b793f..9a77f2e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -110,6 +110,16 @@ This functionality is also wrapped into estimate\_billing.py under the I'd still run these separately just to have both, but if you're only after the CSV this may be more convenient. +# cost\_script.py + +This is a script to be used on the costs tsv. + +It will summarize the outputs of the tsv, adding up the costs for the same step. +And output a csv labeled costss_report_final.csv + +Use as follows- + + python3 /opt/scripts/cost_script.py costs.tsv # Troubleshooting scripts From 8fa79c928e9761c31346b72d68a33ecaa61d1896 Mon Sep 17 00:00:00 2001 From: ksinghal28 <36466671+ksinghal28@users.noreply.github.com> Date: Thu, 13 Oct 2022 21:09:25 -0500 Subject: [PATCH 3/4] Updating to make regex command cleaner --- scripts/cost_script.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/cost_script.py b/scripts/cost_script.py index 818605d..ea7087b 100644 --- a/scripts/cost_script.py +++ b/scripts/cost_script.py @@ -19,19 +19,16 @@ L = line.split('\t') #split by tab table.append(L) -#delete anything that resembles 'shard' followed by a number. If there's a 3-digit shard at some point, add a re.sub(\d\d\d) line before the re.sub(\d\d) line. -#have to go in descending order of numerical digits because otherwise will delete shard# and be left with name-of-task# which won't get deleted. +#delete anything that resembles 'shard' followed by a number. for i in table: if "shard" in i[0]: if "retry" in i[0]: # print("retry",i[0]) - i[0] = re.sub('_shard-\d\d','',i[0]) - i[0] = re.sub('_shard-\d','',i[0]) + i[0] = re.sub('_shard-\d+','',i[0]) # print(i[0]) else: # print("no retry",i[0]) - i[0] = re.sub('_shard-\d\d','',i[0]) - i[0] = re.sub('_shard-\d','',i[0]) + i[0] = re.sub('_shard-\d+','',i[0]) # print(i[0]) @@ -50,4 +47,4 @@ table_df_sum=table_df_sum.sort_values(by=['totalCost'], ascending=False) #save to csv -table_df_sum.to_csv('costs_report_final.csv', index=True) \ No newline at end of file +table_df_sum.to_csv('costs_report_final.csv', index=True) From 1e1d1527e214969d689e3f30a9f5375f640e180e Mon Sep 17 00:00:00 2001 From: ksinghal28 <36466671+ksinghal28@users.noreply.github.com> Date: Thu, 13 Oct 2022 21:12:45 -0500 Subject: [PATCH 4/4] Updating README verbiage to be clearer --- scripts/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 9a77f2e..746746d 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -112,10 +112,8 @@ after the CSV this may be more convenient. # cost\_script.py -This is a script to be used on the costs tsv. - -It will summarize the outputs of the tsv, adding up the costs for the same step. -And output a csv labeled costss_report_final.csv +Takes the output of costs_json_to_csv.py and collapses tasks that have been split into shards, giving one cost for the entire task. +It outputs a csv labeled costs_report_final.csv. Use as follows-