From 351f55968f72384a1f59a898a1e81fe620a344de Mon Sep 17 00:00:00 2001
From: Zhicheng Huang
Date: Wed, 29 Nov 2023 16:56:40 -0500
Subject: [PATCH 1/2] Uprev benchmark links from data branch

---
 .../benchmarks/bio/bio-align/genome-diff.sh   |   2 +-
 .../benchmarks/bio/bio-align/genquality.sh    |   2 +-
 evaluation/benchmarks/bio/bio1/setup.sh       |   2 +-
 .../max-temp/max-temp-preprocess.sh           |   4 +-
 evaluation/benchmarks/max-temp/max-temp.sh    |   2 +-
 .../benchmarks/max-temp/temp-analytics.sh     |   2 +-
 evaluation/benchmarks/nlp/input/setup.sh      |   2 +-
 .../benchmarks/oneliners/input/setup.sh       |   8 +-
 .../benchmarks/web-index/input/setup.sh       |   3 +-
 evaluation/intro/README.md                    |   5 +
 evaluation/intro/input/setup.sh               |  26 +-
 evaluation/intro/test.sh                      |   4 +-
 evaluation/osdi22-eval/run_all.sh             |  41 +---
 evaluation/other/circular/sq.sh               |   1 +
 evaluation/other/more-scripts/page-count.sh   |   2 +-
 evaluation/other/more-scripts/spell.sh        |   2 +-
 evaluation/tests/input/setup.sh               |  29 ++-
 evaluation/tests/interface_tests/env_vars.sh  |   9 +
 evaluation/tests/interface_tests/redir-dup.sh |   3 +
 .../tests/interface_tests/redir-var-test.sh   |  10 +
 evaluation/tests/interface_tests/run.sh       |  45 +++-
 .../tests/interface_tests/test-exclam.sh      |   3 +
 evaluation/tests/interface_tests/test-star.sh |  12 +
 evaluation/tests/minimal_grep_stdin_test.in   |   2 +-
 evaluation/tests/sed-test.sh                  |   6 +-
 evaluation/tests/shortest_scripts.sh          |   4 +-
 evaluation/tests/test_evaluation_scripts.sh   | 229 ++++++++++++++++++
 27 files changed, 372 insertions(+), 88 deletions(-)
 create mode 100644 evaluation/intro/README.md
 create mode 100644 evaluation/tests/interface_tests/env_vars.sh
 create mode 100644 evaluation/tests/interface_tests/redir-dup.sh
 create mode 100644 evaluation/tests/interface_tests/redir-var-test.sh
 create mode 100755 evaluation/tests/interface_tests/test-exclam.sh
 create mode 100644 evaluation/tests/interface_tests/test-star.sh
 create mode 100755 evaluation/tests/test_evaluation_scripts.sh

diff --git a/evaluation/benchmarks/bio/bio-align/genome-diff.sh b/evaluation/benchmarks/bio/bio-align/genome-diff.sh
index a269f9e95..c82061797 100755
--- a/evaluation/benchmarks/bio/bio-align/genome-diff.sh
+++ b/evaluation/benchmarks/bio/bio-align/genome-diff.sh
@@ -11,7 +11,7 @@
 # bacteria), and any regions with less than 10 supporting reads.
 
 # Requires: samtools, minimap2, bcftools
-# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa
+# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa
 # https://github.com/samtools/samtools/releases/latest
 # https://github.com/lh3/minimap2
 
diff --git a/evaluation/benchmarks/bio/bio-align/genquality.sh b/evaluation/benchmarks/bio/bio-align/genquality.sh
index 64c777fdd..62c731960 100755
--- a/evaluation/benchmarks/bio/bio-align/genquality.sh
+++ b/evaluation/benchmarks/bio/bio-align/genquality.sh
@@ -6,7 +6,7 @@
 # http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html
 
 # Require: csvkit
-# Data: http://ndr.md/data/bio/genbank.txt
+# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt
 
 IN=./input/genbank.txt
 OUT=./output/out.txt
diff --git a/evaluation/benchmarks/bio/bio1/setup.sh b/evaluation/benchmarks/bio/bio1/setup.sh
index 40bdd47cb..9c2bb1629 100644
--- a/evaluation/benchmarks/bio/bio1/setup.sh
+++ b/evaluation/benchmarks/bio/bio1/setup.sh
@@ -8,7 +8,7 @@ mkdir -p input
 mkdir -p output
 cd input
 
 if [[ ! -f R1.fastq ]]; then
-    wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
+    wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
     gunzip R1.fastq.gz
     gunzip R2.fastq.gz
diff --git a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
index e3d4b98c5..8d0719049 100755
--- a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
+++ b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-sed 's;^;http://ndr.md/data/noaa/;' |
+sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed 's;$;/;' |
     xargs -r -n 1 curl -s |
    grep gz |
     tr -s ' \n' |
     cut -d ' ' -f9 |
     sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     xargs -n1 curl -s |
     gunzip
diff --git a/evaluation/benchmarks/max-temp/max-temp.sh b/evaluation/benchmarks/max-temp/max-temp.sh
index b0c18aaa8..b74f72b10 100755
--- a/evaluation/benchmarks/max-temp/max-temp.sh
+++ b/evaluation/benchmarks/max-temp/max-temp.sh
@@ -2,7 +2,7 @@
 
 FROM=${FROM:-2015}
 TO=${TO:-2015}
-IN=${IN:-'http://ndr.md/data/noaa/'}
+IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
 fetch=${fetch:-"curl -s"}
 
 seq $FROM $TO |
diff --git a/evaluation/benchmarks/max-temp/temp-analytics.sh b/evaluation/benchmarks/max-temp/temp-analytics.sh
index 319a8f0e4..a1399fa7d 100755
--- a/evaluation/benchmarks/max-temp/temp-analytics.sh
+++ b/evaluation/benchmarks/max-temp/temp-analytics.sh
@@ -2,7 +2,7 @@
 
 FROM=${FROM:-2015}
 TO=${TO:-2015}
-IN=${IN:-'http://ndr.md/data/noaa/'}
+IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
 fetch=${fetch:-"curl -s"}
 
 data_file=temperatures.txt
diff --git a/evaluation/benchmarks/nlp/input/setup.sh b/evaluation/benchmarks/nlp/input/setup.sh
index 5486b39f2..a26a9cf19 100755
--- a/evaluation/benchmarks/nlp/input/setup.sh
+++ b/evaluation/benchmarks/nlp/input/setup.sh
@@ -20,7 +20,7 @@ setup_dataset() {
     cd pg
     if [[ "$1" == "--full" ]]; then
         echo 'N.b.: download/extraction will take about 10min'
-        wget ndr.md/data/pg.tar.xz
+        wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
         if [ $? -ne 0 ]; then
             cat <<-'EOF' | sed 's/^ *//'
                 Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
diff --git a/evaluation/benchmarks/oneliners/input/setup.sh b/evaluation/benchmarks/oneliners/input/setup.sh
index 96388980d..eb8a00317 100755
--- a/evaluation/benchmarks/oneliners/input/setup.sh
+++ b/evaluation/benchmarks/oneliners/input/setup.sh
@@ -26,7 +26,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./1M.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find 1M.txt -- please contact the developers of pash'
             exit 1
@@ -51,7 +51,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./1G.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find 1G.txt -- please contact the developers of pash'
             exit 1
@@ -61,7 +61,7 @@ setup_dataset() {
 
     # download wamerican-insane dictionary and sort according to machine
     if [ ! -f ./dict.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find dict.txt -- please contact the developers of pash'
             exit 1
@@ -70,7 +70,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./all_cmds.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
         if [ $? -ne 0 ]; then
             # This should be OK for tests, no need for abort
             ls /usr/bin/* > all_cmds.txt
diff --git a/evaluation/benchmarks/web-index/input/setup.sh b/evaluation/benchmarks/web-index/input/setup.sh
index 72a4fd8f9..79a77276a 100755
--- a/evaluation/benchmarks/web-index/input/setup.sh
+++ b/evaluation/benchmarks/web-index/input/setup.sh
@@ -17,8 +17,7 @@ setup_dataset() {
     wget $wiki_archive || eexit "cannot fetch wikipedia"
     7za x wikipedia-en-html.tar.7z
     tar -xvf wikipedia-en-html.tar
-    wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
-    # It is actually OK if we don't have this index since we download the 500/1000 below
+    wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
   fi
 
   if [ "$1" = "--small" ]; then
diff --git a/evaluation/intro/README.md b/evaluation/intro/README.md
new file mode 100644
index 000000000..194e1bf58
--- /dev/null
+++ b/evaluation/intro/README.md
@@ -0,0 +1,5 @@
+To create the input files needed for evaluation, run:
+
+```bash
+./input/setup.sh
+```
diff --git a/evaluation/intro/input/setup.sh b/evaluation/intro/input/setup.sh
index a524e9e56..c2eaa684d 100755
--- a/evaluation/intro/input/setup.sh
+++ b/evaluation/intro/input/setup.sh
@@ -6,27 +6,33 @@ cd $(dirname $0)
 
 [ "$1" = "-c" ] && rm-files 100M.txt words sorted_words
 
+
 if [ ! -f ./100M.txt ]; then
-    curl -f 'ndr.md/data/dummy/100M.txt' > 100M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
     if [ $? -ne 0 ]; then
-        curl -f 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1M > 1M.txt
+        # Pipe curl through tac (twice) in order to consume all the output from curl.
+        # This way, curl can write the whole page and not emit an error code.
+        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
         [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
         touch 100M.txt
-        for (( i = 0; i < 10; i++ )); do
-            cat 1M.txt >> 10M.txt
+        for (( i = 0; i < 100; i++ )); do
+            cat 1M.txt >> 100M.txt
         done
     fi
     append_nl_if_not ./100M.txt
 fi
 
 if [ ! -f ./words ]; then
-    curl -f 'http://ndr.md/data/dummy/words' > words
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
     if [ $? -ne 0 ]; then
-        if [ $(uname) = 'Darwin' ]; then
-            cp /usr/share/dict/web2 words || eexit "cannot find dict file"
-        else
-            # apt install wamerican-insane
-            cp /usr/share/dict/words words || eexit "cannot find dict file"
+        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
         fi
     fi
     append_nl_if_not words
diff --git a/evaluation/intro/test.sh b/evaluation/intro/test.sh
index a72232832..3563fb315 100755
--- a/evaluation/intro/test.sh
+++ b/evaluation/intro/test.sh
@@ -17,10 +17,10 @@ run_test()
     local test=$1
     echo -n "Running $test..."
     TIMEFORMAT="${test%%.*}:%3R" # %3U %3S"
-    { time $bash "$test" > "$output_dir/$test.bash.out"; } 2>> $output_dir/results.time_bash
+    { time $bash "$test" > "$output_dir/$test.bash.out"; } 2> >(tee -a $output_dir/results.time_bash)
     test_bash_ec=$?
     TIMEFORMAT="%3R" # %3U %3S"
-    { time $pash "$test" > "$output_dir/$test.pash.out"; } 2>> $output_dir/results.time_pash
+    { time $pash "$test" > "$output_dir/$test.pash.out"; } 2> >(tee -a $output_dir/results.time_pash)
     test_pash_ec=$?
     diff "$output_dir/$test.bash.out" "$output_dir/$test.pash.out"
     test_diff_ec=$?
diff --git a/evaluation/osdi22-eval/run_all.sh b/evaluation/osdi22-eval/run_all.sh
index 2c9379ae9..d6d26f032 100755
--- a/evaluation/osdi22-eval/run_all.sh
+++ b/evaluation/osdi22-eval/run_all.sh
@@ -59,46 +59,7 @@ run_bench() {
     done
 }
 
-function run_comm_du_benchmarks() {
-    # generate output folder for each run
-    export RES_FOLDER=$1
-    # clean previous runs
-    rm -rf ${RES_FOLDER}
-    mkdir -p ${RES_FOLDER}
-    cd ${PASH_TOP}/evaluation/benchmarks
-    # remove all res files from previous runs
-    find . -type d -name "outputs" | xargs rm -rf
-    # do not remove any input from the node_modules dataset
-    find . -type d -not -path "*/node_modules/*" -name "output" | xargs rm -rf
-    find . -type d -name "pash_logs" | xargs rm -rf
-    find . -type f -name "*.res" | xargs rm -f
-    export PASH_BENCHMARK=("oneliners" "unix50" "analytics-mts" "nlp" "max-temp" "dependency_untangling")
-
-    echo 'Running all benchmark for bash'
-    time run_bash
-
-    echo 'Running commutativity benchmarks'
-    export PASH_ALL_FLAGS=("--dgsh_tee --width 16"
-                           "--dgsh_tee --r_split --width 16" )
-    export PASH_BENCHMARK=( "oneliners" "unix50" "analytics-mts" "max-temp")
-    export PASH_MODE=( "disabled_commutativity"
-                       "enabled_commutativity" )
-    time run_bench
-
-    echo 'Running dependency untangling benchmarks'
-    export PASH_ALL_FLAGS=("--r_split --dgsh_tee "
-                           "--r_split --dgsh_tee --parallel_pipelines" )
-    export PASH_BENCHMARK=( "nlp" "dependency_untangling" )
-    export PASH_MODE=( "disabled_dependency_untangling"
-                       "enabled_dependency_untangling" )
-
-    time run_bench
-
-    # kill the hanging processes
-    pkill -f cat
-}
-
-function run_all_benchmarks() {
+run_all_benchmarks() {
     # generate output folder for each run
     export RES_FOLDER=$1
     # clean previous runs
diff --git a/evaluation/other/circular/sq.sh b/evaluation/other/circular/sq.sh
index 89520e695..bce2a72cd 100755
--- a/evaluation/other/circular/sq.sh
+++ b/evaluation/other/circular/sq.sh
@@ -2,6 +2,7 @@
 
 # Clever trick that uses the /dev/fd/xx pseudo-file system
 # https://stackoverflow.com/questions/40244/how-to-make-a-pipe-loop-in-bash
+# MMG 2022-06-30 the `function` kw is a bash-ism; leaving it in to not disrupt what gets optimized in previous evaluations
 
 function calc() {
     # calculate sum of squares of numbers 0,..,10
diff --git a/evaluation/other/more-scripts/page-count.sh b/evaluation/other/more-scripts/page-count.sh
index b4a3326e5..c4d89ecfd 100755
--- a/evaluation/other/more-scripts/page-count.sh
+++ b/evaluation/other/more-scripts/page-count.sh
@@ -5,7 +5,7 @@
 
 # Require: libimage-exiftool-perl, bc
 # Data:
-#   http://ndr.md/data/dummy/large.pdf
+#   atlas-group.cs.brown.edu/data/large.pdf
 # More data:
 #   https://arxiv.org/help/bulk_data
 
diff --git a/evaluation/other/more-scripts/spell.sh b/evaluation/other/more-scripts/spell.sh
index 1d4a9f330..9fd5e7384 100755
--- a/evaluation/other/more-scripts/spell.sh
+++ b/evaluation/other/more-scripts/spell.sh
@@ -6,7 +6,7 @@
 
 # TODO: `groff is an interesting "pure", whose wrapper only needs split input
 # TODO: files carefully.
-# Data: http://ndr.md/data/dummy/ronn.1
+# Data: atlas-group.cs.brown.edu/data/dummy/ronn.1
 
 # dict depends on the system (and has to be sorted), so we assume it exists
 dict=./input/dict.txt
diff --git a/evaluation/tests/input/setup.sh b/evaluation/tests/input/setup.sh
index ac78afd20..ccc6712fe 100755
--- a/evaluation/tests/input/setup.sh
+++ b/evaluation/tests/input/setup.sh
@@ -16,17 +16,23 @@ esac
 [ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt
 
 if [ ! -f ./1M.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
-        curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
-        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
+        if [ $? -ne 0 ]; then
+            curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
+            [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        fi
     fi
     append_nl_if_not ./1M.txt
 fi
 
 if [ ! -f ./all_cmds.txt ]; then
     if [ "$(hostname)" = "deathstar" ]; then
-        curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
+        curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
+        if [ $? -ne 0 ]; then
+            curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
+        fi
     else
         ls /usr/bin/* > all_cmds.txt
     fi
@@ -34,13 +40,16 @@ if [ ! -f ./all_cmds.txt ]; then
 fi
 
 if [ ! -f ./words ]; then
-    curl -sf 'http://ndr.md/data/dummy/words' > words
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
     if [ $? -ne 0 ]; then
-        if [ $(uname) = 'Darwin' ]; then
-            cp /usr/share/dict/web2 words || eexit "cannot find dict file"
-        else
-            # apt install wamerican-insane
-            cp /usr/share/dict/words words || eexit "cannot find dict file"
+        curl -f 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
         fi
     fi
     append_nl_if_not words
diff --git a/evaluation/tests/interface_tests/env_vars.sh b/evaluation/tests/interface_tests/env_vars.sh
new file mode 100644
index 000000000..784a4ae6d
--- /dev/null
+++ b/evaluation/tests/interface_tests/env_vars.sh
@@ -0,0 +1,9 @@
+myfunction() {
+    env | sort > tmp1.txt
+}
+shellvar1=123456
+shellvar2="This is several words"
+shellvar3=" xxx "
+export shellvar2
+trap myfunction EXIT
+env | sort > tmp2.txt
diff --git a/evaluation/tests/interface_tests/redir-dup.sh b/evaluation/tests/interface_tests/redir-dup.sh
new file mode 100644
index 000000000..107d956b0
--- /dev/null
+++ b/evaluation/tests/interface_tests/redir-dup.sh
@@ -0,0 +1,3 @@
+(echo one >&2) 2>&1
+(echo two >&2) 2>-
+(echo three >&2) 2>&1
diff --git a/evaluation/tests/interface_tests/redir-var-test.sh b/evaluation/tests/interface_tests/redir-var-test.sh
new file mode 100644
index 000000000..e82ffd7cb
--- /dev/null
+++ b/evaluation/tests/interface_tests/redir-var-test.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+func_emit_tests_Makefile_am ()
+{
+    ofd=3
+    {
+        echo hi
+    } >&$ofd
+}
+fd=1
+echo hi >&$fd
diff --git a/evaluation/tests/interface_tests/run.sh b/evaluation/tests/interface_tests/run.sh
index 6a1a7fb01..e0cd53cf1 100755
--- a/evaluation/tests/interface_tests/run.sh
+++ b/evaluation/tests/interface_tests/run.sh
@@ -4,7 +4,7 @@ export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-
 # time: print real in seconds, to simplify parsing
 
 bash="bash"
-pash="$PASH_TOP/pa.sh --parallel_pipelines --r_split --dgsh_tee --profile_driven"
+pash="$PASH_TOP/pa.sh --parallel_pipelines --profile_driven"
 
 output_dir="$PASH_TOP/evaluation/tests/interface_tests/output"
 rm -rf "$output_dir"
@@ -142,13 +142,10 @@ test14()
     $shell +a readonly.sh
 }
 
-## Checks interactivity
-##
-## TODO: Make the interactivity script more elaborate (variable dependencies)
 test15()
 {
     local shell=$1
-    $shell < readonly.sh
+    $shell readonly.sh
 }
 
 test16()
@@ -291,6 +288,39 @@ test_var_assgn_default()
     $shell var_assgn.sh
 }
 
+test_exclam()
+{
+    local shell=$1
+    $shell test-exclam.sh
+}
+
+test_redir_var_test()
+{
+    local shell=$1
+    $shell redir-var-test.sh
+}
+
+test_star()
+{
+    local shell=$1
+    $shell test-star.sh foo '*' baz 'hi michael' "abc
+    dfg"
+}
+
+test_env_vars()
+{
+    local shell=$1
+    rm -f tmp1.txt tmp2.txt
+    $shell env_vars.sh
+    diff tmp1.txt tmp2.txt
+}
+
+test_redir_dup()
+{
+    local shell=$1
+    $shell redir-dup.sh
+}
+
 ## We run all tests composed with && to exit on the first that fails
 if [ "$#" -eq 0 ]; then
     run_test test1
@@ -330,6 +360,11 @@ if [ "$#" -eq 0 ]; then
     run_test test_expand_u_positional
     run_test test_quoting
     run_test test_var_assgn_default
+    run_test test_exclam
+    run_test test_redir_var_test
+    run_test test_star
+    run_test test_env_vars
+    run_test test_redir_dup
 else
     for testname in $@
     do
diff --git a/evaluation/tests/interface_tests/test-exclam.sh b/evaluation/tests/interface_tests/test-exclam.sh
new file mode 100755
index 000000000..8fb0eeeb1
--- /dev/null
+++ b/evaluation/tests/interface_tests/test-exclam.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+echo "!"
+
diff --git a/evaluation/tests/interface_tests/test-star.sh b/evaluation/tests/interface_tests/test-star.sh
new file mode 100644
index 000000000..73c144108
--- /dev/null
+++ b/evaluation/tests/interface_tests/test-star.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+printf '%s\n' "$@"
+printf '%s\n' "$@"
+
+echo "$#"
+echo "$@"
+echo "$1"
+echo "$2"
+echo "$3"
+echo "$4"
+echo "$5"
diff --git a/evaluation/tests/minimal_grep_stdin_test.in b/evaluation/tests/minimal_grep_stdin_test.in
index e4f85a218..c4d2ac9e4 100755
--- a/evaluation/tests/minimal_grep_stdin_test.in
+++ b/evaluation/tests/minimal_grep_stdin_test.in
@@ -1 +1 @@
-../evaluation/tests/input/1M.txt
+./input/1M.txt
diff --git a/evaluation/tests/sed-test.sh b/evaluation/tests/sed-test.sh
index f5ba0ac85..38d1cc855 100644
--- a/evaluation/tests/sed-test.sh
+++ b/evaluation/tests/sed-test.sh
@@ -1,11 +1,11 @@
 cat $PASH_TOP/evaluation/tests/input/1M.txt |
     sed 's;^d;da;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed 's;$;/;' |
     sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed "s#^#$WIKI#" |
     sed s/\$/'0s'/ |
     sed 1d |
     sed 4d |
-    sed "\$d"
\ No newline at end of file
+    sed "\$d"
diff --git a/evaluation/tests/shortest_scripts.sh b/evaluation/tests/shortest_scripts.sh
index 0d3913119..7321d775e 100644
--- a/evaluation/tests/shortest_scripts.sh
+++ b/evaluation/tests/shortest_scripts.sh
@@ -4,4 +4,6 @@
 # +p.95 multiple sed
 # +p.XX crawler
 
-cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+# cut -d: -f1 -> cut -d : -f 1; as parser recognizes option arguments only if given with whitespace
+# head -15 -> head -n 15; not documented in man page
+cat $IN | xargs file | grep "shell script" | cut -d : -f 1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -n 15
diff --git a/evaluation/tests/test_evaluation_scripts.sh b/evaluation/tests/test_evaluation_scripts.sh
new file mode 100755
index 000000000..b3c6731de
--- /dev/null
+++ b/evaluation/tests/test_evaluation_scripts.sh
@@ -0,0 +1,229 @@
+#!/bin/bash
+# time: print real in seconds, to simplify parsing
+## Necessary to set PASH_TOP
+cd $(dirname $0)
+export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
+export DEBUG=0
+export PASH_LOG=1
+# export DEBUG=1 # Uncomment to print pash output
+## Determines whether the experimental pash flags will be tested.
+## By default they are not.
+export EXPERIMENTAL=0
+for item in $@
+do
+    if [ "--debug" == "$item" ] || [ "-d" == "$item" ]; then
+        export DEBUG=1
+    fi
+    if [ "--no-pash-log" == "$item" ]; then
+        export PASH_LOG=0
+    fi
+    if [ "--experimental" == "$item" ]; then
+        export EXPERIMENTAL=1
+    fi
+done
+
+microbenchmarks_dir="${PASH_TOP}/evaluation/tests"
+intermediary_dir="${PASH_TOP}/evaluation/tests/test_intermediary"
+test_results_dir="${PASH_TOP}/evaluation/tests/results"
+results_time="$test_results_dir/results.time"
+results_time_bash=${results_time}_bash
+results_time_pash=${results_time}_pash
+
+echo "Deleting eager intermediate files..."
+rm -rf "$test_results_dir"
+rm -rf "$intermediary_dir"
+mkdir -p $intermediary_dir
+mkdir -p "$test_results_dir"
+
+echo "Generating inputs..."
+cd "$microbenchmarks_dir/input"
+./setup.sh
+cd -
+
+n_inputs=(
+    2
+    8
+)
+
+if [ "$EXPERIMENTAL" -eq 1 ]; then
+    configurations=(
+        # "" # Commenting this out since the tests take a lot of time to finish
+        "--parallel_pipelines"
+    )
+else
+    configurations=(
+        "--parallel_pipelines --profile_driven"
+    )
+fi
+
+
+## Tests where the compiler will not always succeed (e.g. because they have mkfifo)
+script_microbenchmarks=(
+    diff # (quick-abort) BUG: Might have to do with the named pipes, and the fact that they are reused for parallel and sequential script.
+    set-diff # TODO: Handle redirection after reduce
+    export_var_script # Tests whether exported variables in the scripts that are processed by PaSh runtime are visible to the rest of the script.
+    comm-par-test # Small comm test to ensure non-parallelizability
+    comm-par-test2 # Small comm test with input redirection and hyphen
+    tee_web_index_bug # Tests a tee bug from web index
+    fun-def # Tests whether PaSh can handle a simple function definition
+    bigrams # One-liner
+    spell-grep # Spell variant with `grep -f` instead of `comm`
+)
+
+pipeline_microbenchmarks=(
+    grep # One-liner
+    minimal_sort # One-liner
+    minimal_grep # One-liner
+    topn # One-liner
+    wf # One-liner
+    spell # One-liner
+    shortest_scripts # One-liner
+    alt_bigrams # One-liner
+    deadlock_test # Test to check deadlock prevention using drain_stream
+    double_sort # Checks maximum performance gains from split
+    no_in_script # Tests whether a script can be executed by our infrastructure without having its input in a file called $IN
+    for_loop_simple # Tests whether PaSh can handle a for loop where the body is parallelizable
+    minimal_grep_stdin # Tests whether PaSh can handle a script that reads from stdin
+    micro_10 # A small version of the pipeline above for debugging.
+    sed-test # Tests all sed occurrences in our evaluation to make sure that they work
+    tr-test # Tests all possible behaviors of tr that exist in our evaluation
+    grep-test # Tests some interesting grep invocations
+    ann-agg # Tests custom aggregators in annotations
+    # # # # micro_1000 # Not being run anymore, as it is very slow. Tests whether the compiler is fast enough. It is a huge pipeline without any computation.
+)
+
+
+
+execute_pash_and_check_diff() {
+    TIMEFORMAT="%3R" # %3U %3S"
+    if [ "$DEBUG" -eq 1 ]; then
+        { time "$PASH_TOP/pa.sh" $@ ; } 1> "$pash_output" 2> >(tee -a "${pash_time}" >&2) &&
+            diff -s "$seq_output" "$pash_output" | head | tee -a "${pash_time}" >&2
+    else
+
+        { time "$PASH_TOP/pa.sh" $@ ; } 1> "$pash_output" 2>> "${pash_time}" &&
+            b=$(cat "$pash_time");
+        test_diff_ec=$(cmp -s "$seq_output" "$pash_output" && echo 0 || echo 1)
+        # differ
+        script=$(basename $script_to_execute)
+        if [ $test_diff_ec -ne 0 ]; then
+            c=$(diff -s "$seq_output" "$pash_output" | head)
+            echo "$c$b" > "${pash_time}"
+            echo "$script are not identical" >> $test_results_dir/result_status
+        else
+            echo "Files $seq_output and $pash_output are identical" > "${pash_time}"
+            echo "$script are identical" >> $test_results_dir/result_status
+        fi
+
+    fi
+}
+
+execute_tests() {
+    assert_correctness="$1"
+    microbenchmarks=("${@:2}")
+
+    microbenchmark_configs=( )
+    for i in "${!microbenchmarks[@]}"; do
+        all_flags=${test_flags[@]}
+        microbenchmark_configs[$i]="${microbenchmarks[$i]};${all_flags// /;}"
+    done
+
+    ## This is almost the same loop as the one in execute_evaluation_scripts
+    for microbenchmark_config in "${microbenchmark_configs[@]}"; do
+        IFS=";" read -r -a flags <<< "${microbenchmark_config}"
+        microbenchmark=${flags[0]}
+        echo "Executing test: $microbenchmark"
+        # Execute the sequential script on the first run only
+
+        prefix="${microbenchmarks_dir}/${microbenchmark}"
+
+        export seq_output="${intermediary_dir}/${microbenchmark}_seq_output"
+        seq_time="$test_results_dir/${microbenchmark}_seq.time"
+
+        export script_to_execute="${prefix}.sh"
+        env_file="${prefix}_env_test.sh"
+        funs_file="${prefix}_funs.sh"
+        input_file="${prefix}_test.in"
+
+        if [ -f "$env_file" ]; then
+            . $env_file
+            vars_to_export=$(cut -d= -f1 $env_file)
+            if [ ! -z "$vars_to_export" ]; then
+                export $vars_to_export
+            fi
+        else
+            echo "|-- Does not have env file"
+        fi
+
+        ## Export necessary functions
+        if [ -f "$funs_file" ]; then
+            source $funs_file
+        fi
+
+        ## Redirect the input if there is an input file
+        stdin_redir="/dev/null"
+        if [ -f "$input_file" ]; then
+            stdin_redir="$(cat "$input_file")"
+            echo "|-- Has input file: $stdin_redir"
+        fi
+
+        TIMEFORMAT="${microbenchmark%%.*}:%3R" # %3U %3S"
+        echo -n "|-- Executing the script with bash..."
+        { time /bin/bash "$script_to_execute" > $seq_output ; } \
+            < "$stdin_redir" 2>> "${seq_time}"
+        echo " exited with $?"
+        tail -n1 ${seq_time} >> ${results_time_bash}
+        for conf in "${configurations[@]}"; do
+            for n_in in "${n_inputs[@]}"; do
+                echo "|-- Executing with pash --width ${n_in} ${conf}..."
+                export pash_time="${test_results_dir}/${microbenchmark}_${n_in}_distr_$(echo ${conf} | tr -d ' ').time"
+                export pash_output="${intermediary_dir}/${microbenchmark}_${n_in}_pash_output"
+                export script_conf=${microbenchmark}_${n_in}
+                echo '' > "${pash_time}"
+                # do we need to write the PaSh output ?
+                cat $stdin_redir |
+                    execute_pash_and_check_diff -d $PASH_LOG $assert_correctness ${conf} --width "${n_in}" --output_time $script_to_execute
+                tail -n1 "${pash_time}" >> "${results_time_pash}_${n_in}"
+            done
+        done
+    done
+}
+
+execute_tests "" "${script_microbenchmarks[@]}"
+execute_tests "--assert_compiler_success" "${pipeline_microbenchmarks[@]}"
+
+#cat ${results_time} | sed 's/,/./' > /tmp/a
+#cat /tmp/a | sed 's/@/,/' > ${results_time}
+
+
+if type lsb_release >/dev/null 2>&1 ; then
+    distro=$(lsb_release -i -s)
+elif [ -e /etc/os-release ] ; then
+    distro=$(awk -F= '$1 == "ID" {print $2}' /etc/os-release)
+fi
+
+distro=$(printf '%s\n' "$distro" | LC_ALL=C tr '[:upper:]' '[:lower:]')
+# now do different things depending on distro
+case "$distro" in
+    freebsd*)
+        # change sed to gsed
+        sed () {
+            gsed $@
+        }
+        ;;
+    *)
+        ;;
+esac
+
+echo "group,Bash,Pash2,Pash8" > ${results_time}
+paste -d'@' $test_results_dir/results.time_* | sed 's\,\.\g' | sed 's\:\,\g' | sed 's\@\,\g' >> ${results_time}
+
+#echo "Below follow the identical outputs:"
+#grep "are identical" "$test_results_dir"/result_status | awk '{print $1}'
+
+echo "Below follow the non-identical outputs:"
+grep "are not identical" "$test_results_dir"/result_status | awk '{print $1}'
+
+TOTAL_TESTS=$(cat "$test_results_dir"/result_status | wc -l)
+PASSED_TESTS=$(grep -c "are identical" "$test_results_dir"/result_status)
+echo "Summary: ${PASSED_TESTS}/${TOTAL_TESTS} tests passed."

From e2138ad5b3b2eea05147dcc4b596b444718d62f3 Mon Sep 17 00:00:00 2001
From: Zhicheng Huang
Date: Wed, 29 Nov 2023 17:09:26 -0500
Subject: [PATCH 2/2] Updated url for distr benchmarks and added demo-spell.sh
 as a distr benchmark

---
 .../intro/check-ft-correctness.sh             |  27 +++++
 .../distr_benchmarks/intro/demo-spell.sh      |  16 +++
 .../distr_benchmarks/intro/input/.gitignore   |   3 +
 .../distr_benchmarks/intro/input/setup.sh     |  53 +++++++++
 .../intro/run.distr.faults.sh                 |  66 +++++++++++
 .../distr_benchmarks/nlp/input/setup.sh       |  23 +++-
 .../oneliners/check_ft_correctness.sh         |  27 +++++
 .../distr_benchmarks/oneliners/input/setup.sh |  53 ++++++---
 .../oneliners/run.distr.faults.sh             | 106 ++++++++++++++++++
 .../distr_benchmarks/oneliners/run.distr.sh   |   3 +
 10 files changed, 355 insertions(+), 22 deletions(-)
 create mode 100644 evaluation/distr_benchmarks/intro/check-ft-correctness.sh
 create mode 100755 evaluation/distr_benchmarks/intro/demo-spell.sh
 create mode 100644 evaluation/distr_benchmarks/intro/input/.gitignore
 create mode 100644 evaluation/distr_benchmarks/intro/input/setup.sh
 create mode 100644 evaluation/distr_benchmarks/intro/run.distr.faults.sh
 create mode 100644 evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
 create mode 100644 evaluation/distr_benchmarks/oneliners/run.distr.faults.sh

diff --git a/evaluation/distr_benchmarks/intro/check-ft-correctness.sh b/evaluation/distr_benchmarks/intro/check-ft-correctness.sh
new file mode 100644
index 000000000..81d00d634
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/check-ft-correctness.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Specify the folder where the .out files are located
+folder="$DISH_TOP/evaluation/distr_benchmarks/intro/outputs"
+
+# Loop through the files in the folder
+num_workers=3
+for script_distr_out in "$folder"/*distr.out; do
+    # Extract the script name without the extension
+    script_name=$(basename "$script_distr_out" .distr.out)
+    for ((i = 1; i <= num_workers; i++)); do
+        # get the corresponding .faults.$crashed_worker.out file
+        crashed_worker="worker$i"
+        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"
+
+        # Perform a diff between the two files
+        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
+        if diff -q "$script_faults_out" "$script_distr_out"; then
+            echo "Outputs are identical"
+        else
+            echo "Files are different. Differences are as follows:"
+            diff -y "$script_faults_out" "$script_distr_out"
+        fi
+        echo "-------------------------------------------"
+    done
+
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/demo-spell.sh b/evaluation/distr_benchmarks/intro/demo-spell.sh
new file mode 100755
index 000000000..2872c353f
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/demo-spell.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+cd "$(dirname $0)"
+
+[ -z $PASH_TOP ] && {
+    echo "PASH_TOP not set, maybe $(git rev-parse --show-toplevel)?"
+    exit
+}
+DICT="$DISH_TOP/evaluation/distr_benchmarks/intro/input/sorted_words"
+IN=${IN:-/intro/100M.txt}
+hdfs dfs -cat -ignoreCrc $IN |
+    tr A-Z a-z |
+    tr -cs A-Za-z '\n' |
+    sort |
+    uniq |
+    comm -13 $DICT -
diff --git a/evaluation/distr_benchmarks/intro/input/.gitignore b/evaluation/distr_benchmarks/intro/input/.gitignore
new file mode 100644
index 000000000..f833c1e37
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/input/.gitignore
@@ -0,0 +1,3 @@
+100M.txt
+words
+sorted_words
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/input/setup.sh b/evaluation/distr_benchmarks/intro/input/setup.sh
new file mode 100644
index 000000000..e914a08d0
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/input/setup.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
+. "$PASH_TOP/scripts/utils.sh"
+cd $(dirname $0)
+input_files=("100M.txt")
+local_files=("dict.txt")
+
+[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words
+
+hdfs dfs -mkdir -p /intro
+
+if [ ! -f ./100M.txt ]; then
+    curl -sf --connect-timeout 10 'ndr.md/data/dummy/100M.txt' > 100M.txt
+    if [ $? -ne 0 ]; then
+        # Pipe curl through tac (twice) in order to consume all the output from curl.
+        # This way, curl can write the whole page and not emit an error code.
+        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
+        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        touch 100M.txt
+        for (( i = 0; i < 100; i++ )); do
+            cat 1M.txt >> 100M.txt
+        done
+    fi
+    append_nl_if_not ./100M.txt
+fi
+
+if [ ! -f ./words ]; then
+    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
+    if [ $? -ne 0 ]; then
+        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
+        fi
+    fi
+    append_nl_if_not words
+fi
+
+## Re-sort words for this machine
+if [ ! -f ./sorted_words ]; then
+    sort words > sorted_words
+fi
+
+# Add files with different replication factors
+for file in "${input_files[@]}"; do
+    hdfs dfs -put $file /intro/$file
+    rm -f $file
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/run.distr.faults.sh b/evaluation/distr_benchmarks/intro/run.distr.faults.sh
new file mode 100644
index 000000000..e292bf3d5
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/run.distr.faults.sh
@@ -0,0 +1,66 @@
+PASH_FLAGS='--width 8 --r_split'
+export TIMEFORMAT=%R
+export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
+curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict
+
+
+intro_pash(){
+    flags=${1:-$PASH_FLAGS}
+    prefix=${2:-par}
+    prefix=$prefix
+
+    times_file="$prefix.res"
+    outputs_suffix="$prefix.out"
+    time_suffix="$prefix.time"
+    outputs_dir="outputs"
+    pash_logs_dir="pash_logs_$prefix"
+
+    mkdir -p "$outputs_dir"
+    mkdir -p "$pash_logs_dir"
+
+    touch "$times_file"
+    cat $times_file >> $times_file.d
+    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
+    echo '' >> "$times_file"
+
+
+    script="demo-spell"
+
+
+    printf -v pad %30s
+    padded_script="${script}.sh:${pad}"
+    padded_script=${padded_script:0:30}
+
+    outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
+    pash_log="${pash_logs_dir}/${script}.pash.log"
+    single_time_file="${outputs_dir}/${script}.${time_suffix}"
+
+    echo -n "${padded_script}" | tee -a "$times_file"
+    { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
+    cat "${single_time_file}" | tee -a "$times_file"
+
+}
+
+intro_faults() {
+    # For faults, mock crash for all workers
+    num_workers=3
+    # it's important to set the timeout long enough for now to avoid the "crashed" worker coming back alive while its replacement does work
+    # until it's fully supported!
+    timeout=100
+
+    for ((i = 1; i <= num_workers; i++)); do
+        crashed_worker="worker$i"
+        echo Mocking crash for $crashed_worker with timeout of $timeout seconds
+        echo ----------------------------------------------------------------
+        intro_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice worker$i" "faults_$crashed_worker"
+        # echo "Iteration $i"
+        # Your loop body here
+    done
+}
+
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
+intro_pash "$PASH_FLAGS --distributed_exec" "distr"
+
+intro_faults
diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh
index e523d21a8..380739fc5 100755
--- a/evaluation/distr_benchmarks/nlp/input/setup.sh
+++ b/evaluation/distr_benchmarks/nlp/input/setup.sh
@@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
     cd pg
     if [[ "$1" == "--full" ]]; then
         echo 'N.b.: download/extraction will take about 10min'
-        wget ndr.md/data/pg.tar.xz
+        wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
         if [ $? -ne 0 ]; then
             cat <<-'EOF' | sed 's/^ *//'
                 Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
@@ -31,10 +31,21 @@ if [ ! -e ./pg ]; then
         cat pg.tar.xz | tar -xJ
     else
-        wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
-        unzip nlp.zip
-        mv data/* .
-        rm nlp.zip data -rf
+        # wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
+        # unzip nlp.zip
+        # mv data/* .
+        # rm nlp.zip data -rf
+
+        # Mock 1
+        for (( i = 0; i < 60; i++ )); do
+            touch "$i".txt
+            cat ../genesis >> "$i".txt
+        done
+        # Mock 2
+        for (( i = 61; i < 120; i++ )); do
+            touch "$i".txt
+            cat ../exodus >> "$i".txt
+        done
     fi
 
     for f in *.txt; do
@@ -48,4 +59,4 @@ fi
 hdfs dfs -mkdir /nlp
 hdfs dfs -put exodus /nlp/exodus
 hdfs dfs -put genesis /nlp/genesis
-hdfs dfs -put pg /nlp/pg
+hdfs dfs -put pg /nlp
diff --git a/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh b/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
new file mode 100644
index 000000000..5d9faa5d6
--- /dev/null
+++ b/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Specify the folder where the .out files are located
+folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"
+
+# Loop through the files in the folder
+num_workers=3
+for script_distr_out in "$folder"/*distr.out; do
+    # Extract the script name without the extension
+    script_name=$(basename "$script_distr_out" .distr.out)
+    for ((i = 1; i <= num_workers; i++)); do
+        # get the corresponding .faults.$crashed_worker.out file
+        crashed_worker="worker$i"
+        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"
+
+        # Perform a diff between the two files
+        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
+        if diff -q "$script_faults_out" "$script_distr_out"; then
+            echo "Outputs are identical"
+        else
+            echo "Files are different. Differences are as follows:"
+            diff -y "$script_faults_out" "$script_distr_out"
+        fi
+        echo "-------------------------------------------"
+    done
+
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh
index a24725912..c9078d477 100755
--- a/evaluation/distr_benchmarks/oneliners/input/setup.sh
+++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
 #set -e
 
-PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
+PASH_TOP=${PASH_TOP:-$DISH_TOP/pash}
+. "$PASH_TOP/scripts/utils.sh"
+
 # another solution for capturing HTTP status code
 # https://superuser.com/a/590170
 
@@ -13,14 +15,15 @@ if [[ "$1" == "-c" ]]; then
     exit
 fi
 
-hdfs dfs -mkdir /oneliners
+hdfs dfs -mkdir -p /oneliners
 
 if [ ! -f ./1M.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
-        echo 'cannot find 1M.txt -- please contact the developers of pash'
-        exit 1
+        curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
+        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
     fi
+    append_nl_if_not ./1M.txt
 fi
 
 if [ ! -f ./10M.txt ]; then
@@ -38,35 +41,53 @@ if [ ! -f ./100M.txt ]; then
 fi
 
 if [ ! -f ./1G.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
+    if [ $? -ne 0 ]; then
+        touch 1G.txt
+        for (( i = 0; i < 10; i++ )); do
+            cat 100M.txt >> 1G.txt
+        done
+    fi
+fi
+
+if [ ! -f ./words ]; then
+    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
+    if [ $? -ne 0 ]; then
+        curl -f 'https://zenodo.org/record/7650885/files/words' > words
         if [ $? -ne 0 ]; then
-        echo 'cannot find 1G.txt -- please contact the developers of pash'
-        exit 1
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
+        fi
     fi
+    append_nl_if_not words
 fi
 
 # download wamerican-insane dictionary and sort according to machine
 if [ ! -f ./dict.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
     if [ $? -ne 0 ]; then
-        echo 'cannot find dict.txt -- please contact the developers of pash'
-        exit 1
+        sort words > sorted_words
     fi
 fi
 
 if [ ! -f ./all_cmds.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
     if [ $? -ne 0 ]; then
         # This should be OK for tests, no need for abort
         ls /usr/bin/* > all_cmds.txt
     fi
+    append_nl_if_not ./all_cmds.txt
 fi
 
 if [ ! -f ./all_cmdsx100.txt ]; then
-        touch all_cmdsx100.txt
-        for (( i = 0; i < 100; i++ )); do
-            cat all_cmds.txt >> all_cmdsx100.txt
-        done
+    touch all_cmdsx100.txt
+    for (( i = 0; i < 100; i++ )); do
+        cat all_cmds.txt >> all_cmdsx100.txt
+    done
 fi
 
 if [ ! -f ./3G.txt ]; then
diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh b/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
new file mode 100644
index 000000000..17bbbcdd4
--- /dev/null
+++ b/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
@@ -0,0 +1,106 @@
+PASH_FLAGS='--width 8 --r_split'
+export TIMEFORMAT=%R
+export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
+curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict
+
+
+scripts_inputs=(
+    "nfa-regex;1G.txt"
+    "sort;3G.txt"
+    "top-n;3G.txt"
+    "wf;3G.txt"
+    "spell;3G.txt"
+    "diff;3G.txt"
+    "bi-grams;3G.txt"
+    "set-diff;3G.txt"
+    "sort-sort;3G.txt"
+    "shortest-scripts;all_cmdsx100.txt"
+)
+
+# scripts_num_subgraphs=(
+#     "nfa-regex;1"
+#     "sort;1"
+#     "top-n;1"
+#     "wf;1"
+#     "spell;1"
+#     "diff;2"
+#     "bi-grams;1"
+#     "set-diff;2"
+#     "sort-sort;1"
+#     "shortest-scripts;1"
+# )
+# declare -A num_subgraphs_map
+
+# # Populate the associative array
+# for num_subgraph in "${scripts_num_subgraphs[@]}"; do
+#     IFS=";" read -r -a subgraph_info <<< "$num_subgraph"
+#     script_name="${subgraph_info[0]}"
+#     num_subgraphs="${subgraph_info[1]}"
+#     num_subgraphs_map["$script_name"]=$num_subgraphs
+# done
+
+oneliners_pash(){
+    flags=${1:-$PASH_FLAGS}
+    prefix=${2:-par}
+    prefix=$prefix
+
+    times_file="$prefix.res"
+    outputs_suffix="$prefix.out"
+    time_suffix="$prefix.time"
+    outputs_dir="outputs"
+    pash_logs_dir="pash_logs_$prefix"
+
+    mkdir -p "$outputs_dir"
+    mkdir -p "$pash_logs_dir"
+
+    touch "$times_file"
+    cat $times_file >> $times_file.d
+    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
+    echo '' >> "$times_file"
+
+    for script_input in ${scripts_inputs[@]}
+    do
+        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
+        script="${script_input_parsed[0]}"
+        input="${script_input_parsed[1]}"
+
+        export IN="/oneliners/$input"
+        export dict=
+
+        printf -v pad %30s
+        padded_script="${script}.sh:${pad}"
+        padded_script=${padded_script:0:30}
+
+        outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
+        pash_log="${pash_logs_dir}/${script}.pash.log"
+        single_time_file="${outputs_dir}/${script}.${time_suffix}"
+
+        echo -n "${padded_script}" | tee -a "$times_file"
+        { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
+        cat "${single_time_file}" | tee -a "$times_file"
+    done
+}
+
+oneliners_faults() {
+    # For faults, mock crash for all workers
+    num_workers=3
+    # it's important to set the timeout long enough for now to avoid the "crashed" worker coming back alive while its replacement does work
+    # until it's fully supported!
+    timeout=100
+
+    for ((i = 1; i <= num_workers; i++)); do
+        crashed_worker="worker$i"
+        echo Mocking crash for $crashed_worker with timeout of $timeout seconds
+        echo ----------------------------------------------------------------
+        oneliners_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice worker$i" "faults_$crashed_worker"
+        # echo "Iteration $i"
+        # Your loop body here
+    done
+}
+
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
+oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"
+
+oneliners_faults
diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh
index 680c31797..83ac0d555 100755
--- a/evaluation/distr_benchmarks/oneliners/run.distr.sh
+++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh
@@ -120,6 +120,9 @@ oneliners_hadoopstreaming(){
     mv "hadoop-streaming/$times_file" .
 }
 
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
 oneliners_bash
 oneliners_pash "$PASH_FLAGS" "par"
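
Both patches repeat one download idiom across the setup scripts: try a primary mirror with a short connect timeout, fall back to a Zenodo copy, and finally fall back to a locally generated or system-provided file. A minimal bash sketch of that idiom as a reusable helper follows; `fetch_with_fallback`, its argument layout, and the trailing Gutenberg fallback are illustrative assumptions, not code from either patch.

```bash
#!/bin/bash
# Hypothetical helper (not part of the patches): try each URL in order and
# keep the first successful download in $out.
fetch_with_fallback() {
    local out=$1; shift
    local url
    for url in "$@"; do
        # -f: fail on HTTP errors; -s: silent; --connect-timeout bounds how
        # long a dead mirror can stall the setup script.
        if curl -sf --connect-timeout 10 "$url" > "$out"; then
            return 0
        fi
    done
    rm -f "$out"
    return 1
}

# Usage mirroring the 1M.txt logic in the setup scripts: primary mirror,
# then Zenodo, then a locally generated file from Project Gutenberg.
fetch_with_fallback 1M.txt \
    'atlas-group.cs.brown.edu/data/dummy/1M.txt' \
    'https://zenodo.org/record/7650885/files/1M.txt' ||
    curl -sfL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
```

The `tac | tac` in the Gutenberg fallbacks is deliberate, as the in-script comment notes: it forces curl's entire output to be consumed even though `head -c` stops reading early, so curl is not killed by SIGPIPE and does not report a spurious error for a download that actually succeeded.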