From 351f55968f72384a1f59a898a1e81fe620a344de Mon Sep 17 00:00:00 2001
From: Zhicheng Huang
Date: Wed, 29 Nov 2023 16:56:40 -0500
Subject: [PATCH 1/2] Uprev benchmark links from data branch

---
 .../benchmarks/bio/bio-align/genome-diff.sh   |   2 +-
 .../benchmarks/bio/bio-align/genquality.sh    |   2 +-
 evaluation/benchmarks/bio/bio1/setup.sh       |   2 +-
 .../max-temp/max-temp-preprocess.sh           |   4 +-
 evaluation/benchmarks/max-temp/max-temp.sh    |   2 +-
 .../benchmarks/max-temp/temp-analytics.sh     |   2 +-
 evaluation/benchmarks/nlp/input/setup.sh      |   2 +-
 .../benchmarks/oneliners/input/setup.sh       |   8 +-
 .../benchmarks/web-index/input/setup.sh       |   3 +-
 evaluation/intro/README.md                    |   5 +
 evaluation/intro/input/setup.sh               |  26 +-
 evaluation/intro/test.sh                      |   4 +-
 evaluation/osdi22-eval/run_all.sh             |  41 +---
 evaluation/other/circular/sq.sh               |   1 +
 evaluation/other/more-scripts/page-count.sh   |   2 +-
 evaluation/other/more-scripts/spell.sh        |   2 +-
 evaluation/tests/input/setup.sh               |  29 ++-
 evaluation/tests/interface_tests/env_vars.sh  |   9 +
 evaluation/tests/interface_tests/redir-dup.sh |   3 +
 .../tests/interface_tests/redir-var-test.sh   |  10 +
 evaluation/tests/interface_tests/run.sh       |  45 +++-
 .../tests/interface_tests/test-exclam.sh      |   3 +
 evaluation/tests/interface_tests/test-star.sh |  12 +
 evaluation/tests/minimal_grep_stdin_test.in   |   2 +-
 evaluation/tests/sed-test.sh                  |   6 +-
 evaluation/tests/shortest_scripts.sh          |   4 +-
 evaluation/tests/test_evaluation_scripts.sh   | 229 ++++++++++++++++++
 27 files changed, 372 insertions(+), 88 deletions(-)
 create mode 100644 evaluation/intro/README.md
 create mode 100644 evaluation/tests/interface_tests/env_vars.sh
 create mode 100644 evaluation/tests/interface_tests/redir-dup.sh
 create mode 100644 evaluation/tests/interface_tests/redir-var-test.sh
 create mode 100755 evaluation/tests/interface_tests/test-exclam.sh
 create mode 100644 evaluation/tests/interface_tests/test-star.sh
 create mode 100755 evaluation/tests/test_evaluation_scripts.sh

diff --git a/evaluation/benchmarks/bio/bio-align/genome-diff.sh b/evaluation/benchmarks/bio/bio-align/genome-diff.sh
index a269f9e95..c82061797 100755
--- a/evaluation/benchmarks/bio/bio-align/genome-diff.sh
+++ b/evaluation/benchmarks/bio/bio-align/genome-diff.sh
@@ -11,7 +11,7 @@
 # bacteria), and any regions with less than 10 supporting reads.
 
 # Requires: samtools, minimap2, bcftools
-# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa
+# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa
 # https://github.com/samtools/samtools/releases/latest
 # https://github.com/lh3/minimap2
 
diff --git a/evaluation/benchmarks/bio/bio-align/genquality.sh b/evaluation/benchmarks/bio/bio-align/genquality.sh
index 64c777fdd..62c731960 100755
--- a/evaluation/benchmarks/bio/bio-align/genquality.sh
+++ b/evaluation/benchmarks/bio/bio-align/genquality.sh
@@ -6,7 +6,7 @@
 # http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html
 
 # Require: csvkit
-# Data: http://ndr.md/data/bio/genbank.txt
+# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt
 
 IN=./input/genbank.txt
 OUT=./output/out.txt
diff --git a/evaluation/benchmarks/bio/bio1/setup.sh b/evaluation/benchmarks/bio/bio1/setup.sh
index 40bdd47cb..9c2bb1629 100644
--- a/evaluation/benchmarks/bio/bio1/setup.sh
+++ b/evaluation/benchmarks/bio/bio1/setup.sh
@@ -8,7 +8,7 @@ mkdir -p input
 mkdir -p output
 cd input
 
 if [[ ! -f R1.fastq ]]; then
-    wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
+    wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
     gunzip R1.fastq.gz
     gunzip R2.fastq.gz
diff --git a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
index e3d4b98c5..8d0719049 100755
--- a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
+++ b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-sed 's;^;http://ndr.md/data/noaa/;' |
+sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed 's;$;/;' |
     xargs -r -n 1 curl -s |
    grep gz |
     tr -s ' \n' |
     cut -d ' ' -f9 |
     sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     xargs -n1 curl -s |
     gunzip
diff --git a/evaluation/benchmarks/max-temp/max-temp.sh b/evaluation/benchmarks/max-temp/max-temp.sh
index b0c18aaa8..b74f72b10 100755
--- a/evaluation/benchmarks/max-temp/max-temp.sh
+++ b/evaluation/benchmarks/max-temp/max-temp.sh
@@ -2,7 +2,7 @@
 
 FROM=${FROM:-2015}
 TO=${TO:-2015}
-IN=${IN:-'http://ndr.md/data/noaa/'}
+IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
 fetch=${fetch:-"curl -s"}
 
 seq $FROM $TO |
diff --git a/evaluation/benchmarks/max-temp/temp-analytics.sh b/evaluation/benchmarks/max-temp/temp-analytics.sh
index 319a8f0e4..a1399fa7d 100755
--- a/evaluation/benchmarks/max-temp/temp-analytics.sh
+++ b/evaluation/benchmarks/max-temp/temp-analytics.sh
@@ -2,7 +2,7 @@
 
 FROM=${FROM:-2015}
 TO=${TO:-2015}
-IN=${IN:-'http://ndr.md/data/noaa/'}
+IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
 fetch=${fetch:-"curl -s"}
 
 data_file=temperatures.txt
diff --git a/evaluation/benchmarks/nlp/input/setup.sh b/evaluation/benchmarks/nlp/input/setup.sh
index 5486b39f2..a26a9cf19 100755
--- a/evaluation/benchmarks/nlp/input/setup.sh
+++ b/evaluation/benchmarks/nlp/input/setup.sh
@@ -20,7 +20,7 @@ setup_dataset() {
     cd pg
     if [[ "$1" == "--full" ]]; then
         echo 'N.b.: download/extraction will take about 10min'
-        wget ndr.md/data/pg.tar.xz
+        wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
         if [ $? -ne 0 ]; then
             cat <<-'EOF' | sed 's/^ *//'
                 Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
diff --git a/evaluation/benchmarks/oneliners/input/setup.sh b/evaluation/benchmarks/oneliners/input/setup.sh
index 96388980d..eb8a00317 100755
--- a/evaluation/benchmarks/oneliners/input/setup.sh
+++ b/evaluation/benchmarks/oneliners/input/setup.sh
@@ -26,7 +26,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./1M.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find 1M.txt -- please contact the developers of pash'
             exit 1
@@ -51,7 +51,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./1G.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find 1G.txt -- please contact the developers of pash'
             exit 1
@@ -61,7 +61,7 @@ setup_dataset() {
 
     # download wamerican-insane dictionary and sort according to machine
     if [ ! -f ./dict.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
         if [ $? -ne 0 ]; then
             echo 'cannot find dict.txt -- please contact the developers of pash'
             exit 1
@@ -70,7 +70,7 @@ setup_dataset() {
     fi
 
     if [ ! -f ./all_cmds.txt ]; then
-        curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+        curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
         if [ $? -ne 0 ]; then
             # This should be OK for tests, no need for abort
             ls /usr/bin/* > all_cmds.txt
diff --git a/evaluation/benchmarks/web-index/input/setup.sh b/evaluation/benchmarks/web-index/input/setup.sh
index 72a4fd8f9..79a77276a 100755
--- a/evaluation/benchmarks/web-index/input/setup.sh
+++ b/evaluation/benchmarks/web-index/input/setup.sh
@@ -17,8 +17,7 @@ setup_dataset() {
     wget $wiki_archive || eexit "cannot fetch wikipedia"
     7za x wikipedia-en-html.tar.7z
     tar -xvf wikipedia-en-html.tar
-    wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
-    # It is actually OK if we don't have this index since we download the 500/1000 below
+    wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
   fi
 
   if [ "$1" = "--small" ]; then
diff --git a/evaluation/intro/README.md b/evaluation/intro/README.md
new file mode 100644
index 000000000..194e1bf58
--- /dev/null
+++ b/evaluation/intro/README.md
@@ -0,0 +1,5 @@
+To create the input files needed for evaluation, run:
+
+```bash
+./input/setup.sh
+```
diff --git a/evaluation/intro/input/setup.sh b/evaluation/intro/input/setup.sh
index a524e9e56..c2eaa684d 100755
--- a/evaluation/intro/input/setup.sh
+++ b/evaluation/intro/input/setup.sh
@@ -6,27 +6,33 @@ cd $(dirname $0)
 
 [ "$1" = "-c" ] && rm-files 100M.txt words sorted_words
 
+
 if [ ! -f ./100M.txt ]; then
-    curl -f 'ndr.md/data/dummy/100M.txt' > 100M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
     if [ $? -ne 0 ]; then
-        curl -f 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1M > 1M.txt
+        # Pipe curl through tac (twice) in order to consume all the output from curl.
+        # This way, curl can write the whole page and not emit an error code.
+        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
         [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
         touch 100M.txt
-        for (( i = 0; i < 10; i++ )); do
-            cat 1M.txt >> 10M.txt
+        for (( i = 0; i < 100; i++ )); do
+            cat 1M.txt >> 100M.txt
         done
     fi
     append_nl_if_not ./100M.txt
 fi
 
 if [ ! -f ./words ]; then
-    curl -f 'http://ndr.md/data/dummy/words' > words
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
     if [ $? -ne 0 ]; then
-        if [ $(uname) = 'Darwin' ]; then
-            cp /usr/share/dict/web2 words || eexit "cannot find dict file"
-        else
-            # apt install wamerican-insane
-            cp /usr/share/dict/words words || eexit "cannot find dict file"
+        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
         fi
     fi
     append_nl_if_not words
diff --git a/evaluation/intro/test.sh b/evaluation/intro/test.sh
index a72232832..3563fb315 100755
--- a/evaluation/intro/test.sh
+++ b/evaluation/intro/test.sh
@@ -17,10 +17,10 @@ run_test()
     local test=$1
     echo -n "Running $test..."
     TIMEFORMAT="${test%%.*}:%3R" # %3U %3S"
-    { time $bash "$test" > "$output_dir/$test.bash.out"; } 2>> $output_dir/results.time_bash
+    { time $bash "$test" > "$output_dir/$test.bash.out"; } 2> >(tee -a $output_dir/results.time_bash)
     test_bash_ec=$?
     TIMEFORMAT="%3R" # %3U %3S"
-    { time $pash "$test" > "$output_dir/$test.pash.out"; } 2>> $output_dir/results.time_pash
+    { time $pash "$test" > "$output_dir/$test.pash.out"; } 2> >(tee -a $output_dir/results.time_pash)
     test_pash_ec=$?
     diff "$output_dir/$test.bash.out" "$output_dir/$test.pash.out"
     test_diff_ec=$?
diff --git a/evaluation/osdi22-eval/run_all.sh b/evaluation/osdi22-eval/run_all.sh
index 2c9379ae9..d6d26f032 100755
--- a/evaluation/osdi22-eval/run_all.sh
+++ b/evaluation/osdi22-eval/run_all.sh
@@ -59,46 +59,7 @@ run_bench() {
     done
 }
 
-function run_comm_du_benchmarks() {
-    # generate output folder for each run
-    export RES_FOLDER=$1
-    # clean previous runs
-    rm -rf ${RES_FOLDER}
-    mkdir -p ${RES_FOLDER}
-    cd ${PASH_TOP}/evaluation/benchmarks
-    # remove all res files from previous runs
-    find . -type d -name "outputs" | xargs rm -rf
-    # do not remove any input from the node_modules dataset
-    find . -type d -not -path "*/node_modules/*" -name "output" | xargs rm -rf
-    find . -type d -name "pash_logs" | xargs rm -rf
-    find . -type f -name "*.res" | xargs rm -f
-    export PASH_BENCHMARK=("oneliners" "unix50" "analytics-mts" "nlp" "max-temp" "dependency_untangling")
-
-    echo 'Running all benchmark for bash'
-    time run_bash
-
-    echo 'Running commutativity benchmarks'
-    export PASH_ALL_FLAGS=("--dgsh_tee --width 16"
-                           "--dgsh_tee --r_split --width 16" )
-    export PASH_BENCHMARK=( "oneliners" "unix50" "analytics-mts" "max-temp")
-    export PASH_MODE=( "disabled_commutativity"
-                       "enabled_commutativity" )
-    time run_bench
-
-    echo 'Running dependency untangling benchmarks'
-    export PASH_ALL_FLAGS=("--r_split --dgsh_tee "
-                           "--r_split --dgsh_tee --parallel_pipelines" )
-    export PASH_BENCHMARK=( "nlp" "dependency_untangling" )
-    export PASH_MODE=( "disabled_dependency_untangling"
-                       "enabled_dependency_untangling" )
-
-    time run_bench
-
-    # kill the hanging processes
-    pkill -f cat
-}
-
-function run_all_benchmarks() {
+run_all_benchmarks() {
     # generate output folder for each run
     export RES_FOLDER=$1
     # clean previous runs
diff --git a/evaluation/other/circular/sq.sh b/evaluation/other/circular/sq.sh
index 89520e695..bce2a72cd 100755
--- a/evaluation/other/circular/sq.sh
+++ b/evaluation/other/circular/sq.sh
@@ -2,6 +2,7 @@
 
 # Clever trick that uses the /dev/fd/xx pseudo-file system
 # https://stackoverflow.com/questions/40244/how-to-make-a-pipe-loop-in-bash
+# MMG 2022-06-30 the `function` kw is a bash-ism; leaving it in to not disrupt what gets optimized in previous evaluations
 
 function calc() {
     # calculate sum of squares of numbers 0,..,10
diff --git a/evaluation/other/more-scripts/page-count.sh b/evaluation/other/more-scripts/page-count.sh
index b4a3326e5..c4d89ecfd 100755
--- a/evaluation/other/more-scripts/page-count.sh
+++ b/evaluation/other/more-scripts/page-count.sh
@@ -5,7 +5,7 @@
 
 # Require: libimage-exiftool-perl, bc
 # Data:
-#   http://ndr.md/data/dummy/large.pdf
+#   atlas-group.cs.brown.edu/data/large.pdf
 # More data:
 #   https://arxiv.org/help/bulk_data
 
diff --git a/evaluation/other/more-scripts/spell.sh b/evaluation/other/more-scripts/spell.sh
index 1d4a9f330..9fd5e7384 100755
--- a/evaluation/other/more-scripts/spell.sh
+++ b/evaluation/other/more-scripts/spell.sh
@@ -6,7 +6,7 @@
 
 # TODO: `groff is an interesting "pure", whose wrapper only needs split input
 # TODO: files carefully.
-# Data: http://ndr.md/data/dummy/ronn.1
+# Data: atlas-group.cs.brown.edu/data/dummy/ronn.1
 
 # dict depends on the system (and has to be sorted), so we assume it exists
 dict=./input/dict.txt
diff --git a/evaluation/tests/input/setup.sh b/evaluation/tests/input/setup.sh
index ac78afd20..ccc6712fe 100755
--- a/evaluation/tests/input/setup.sh
+++ b/evaluation/tests/input/setup.sh
@@ -16,17 +16,23 @@ esac
 [ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt
 
 if [ ! -f ./1M.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
-        curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
-        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
+        if [ $? -ne 0 ]; then
+            curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
+            [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        fi
     fi
     append_nl_if_not ./1M.txt
 fi
 
 if [ ! -f ./all_cmds.txt ]; then
     if [ "$(hostname)" = "deathstar" ]; then
-        curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
+        curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
+        if [ $? -ne 0 ]; then
+            curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
+        fi
     else
         ls /usr/bin/* > all_cmds.txt
     fi
@@ -34,13 +40,16 @@ if [ ! -f ./all_cmds.txt ]; then
 fi
 
 if [ ! -f ./words ]; then
-    curl -sf 'http://ndr.md/data/dummy/words' > words
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
     if [ $? -ne 0 ]; then
-        if [ $(uname) = 'Darwin' ]; then
-            cp /usr/share/dict/web2 words || eexit "cannot find dict file"
-        else
-            # apt install wamerican-insane
-            cp /usr/share/dict/words words || eexit "cannot find dict file"
+        curl -f 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
         fi
     fi
     append_nl_if_not words
diff --git a/evaluation/tests/interface_tests/env_vars.sh b/evaluation/tests/interface_tests/env_vars.sh
new file mode 100644
index 000000000..784a4ae6d
--- /dev/null
+++ b/evaluation/tests/interface_tests/env_vars.sh
@@ -0,0 +1,9 @@
+myfunction() {
+    env | sort > tmp1.txt
+}
+shellvar1=123456
+shellvar2="This is several words"
+shellvar3=" xxx "
+export shellvar2
+trap myfunction EXIT
+env | sort > tmp2.txt
diff --git a/evaluation/tests/interface_tests/redir-dup.sh b/evaluation/tests/interface_tests/redir-dup.sh
new file mode 100644
index 000000000..107d956b0
--- /dev/null
+++ b/evaluation/tests/interface_tests/redir-dup.sh
@@ -0,0 +1,3 @@
+(echo one >&2) 2>&1
+(echo two >&2) 2>-
+(echo three >&2) 2>&1
diff --git a/evaluation/tests/interface_tests/redir-var-test.sh b/evaluation/tests/interface_tests/redir-var-test.sh
new file mode 100644
index 000000000..e82ffd7cb
--- /dev/null
+++ b/evaluation/tests/interface_tests/redir-var-test.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+func_emit_tests_Makefile_am ()
+{
+    ofd=3
+    {
+        echo hi
+    } >&$ofd
+}
+fd=1
+echo hi >&$fd
diff --git a/evaluation/tests/interface_tests/run.sh b/evaluation/tests/interface_tests/run.sh
index 6a1a7fb01..e0cd53cf1 100755
--- a/evaluation/tests/interface_tests/run.sh
+++ b/evaluation/tests/interface_tests/run.sh
@@ -4,7 +4,7 @@ export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-
 # time: print real in seconds, to simplify parsing
 
 bash="bash"
-pash="$PASH_TOP/pa.sh --parallel_pipelines --r_split --dgsh_tee --profile_driven"
+pash="$PASH_TOP/pa.sh --parallel_pipelines --profile_driven"
 
 output_dir="$PASH_TOP/evaluation/tests/interface_tests/output"
 rm -rf "$output_dir"
@@ -142,13 +142,10 @@ test14()
     $shell +a readonly.sh
 }
 
-## Checks interactivity
-##
-## TODO: Make the interactivity script more elaborate (variable dependencies)
 test15()
 {
     local shell=$1
-    $shell < readonly.sh
+    $shell readonly.sh
 }
 
 test16()
@@ -291,6 +288,39 @@ test_var_assgn_default()
     $shell var_assgn.sh
 }
 
+test_exclam()
+{
+    local shell=$1
+    $shell test-exclam.sh
+}
+
+test_redir_var_test()
+{
+    local shell=$1
+    $shell redir-var-test.sh
+}
+
+test_star()
+{
+    local shell=$1
+    $shell test-star.sh foo '*' baz 'hi michael' "abc
+    dfg"
+}
+
+test_env_vars()
+{
+    local shell=$1
+    rm -f tmp1.txt tmp2.txt
+    $shell env_vars.sh
+    diff tmp1.txt tmp2.txt
+}
+
+test_redir_dup()
+{
+    local shell=$1
+    $shell redir-dup.sh
+}
+
 ## We run all tests composed with && to exit on the first that fails
 if [ "$#" -eq 0 ]; then
     run_test test1
@@ -330,6 +360,11 @@ if [ "$#" -eq 0 ]; then
     run_test test_expand_u_positional
     run_test test_quoting
     run_test test_var_assgn_default
+    run_test test_exclam
+    run_test test_redir_var_test
+    run_test test_star
+    run_test test_env_vars
+    run_test test_redir_dup
 else
     for testname in $@
     do
diff --git a/evaluation/tests/interface_tests/test-exclam.sh b/evaluation/tests/interface_tests/test-exclam.sh
new file mode 100755
index 000000000..8fb0eeeb1
--- /dev/null
+++ b/evaluation/tests/interface_tests/test-exclam.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+echo "!"
+
diff --git a/evaluation/tests/interface_tests/test-star.sh b/evaluation/tests/interface_tests/test-star.sh
new file mode 100644
index 000000000..73c144108
--- /dev/null
+++ b/evaluation/tests/interface_tests/test-star.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+printf '%s\n' "$@"
+printf '%s\n' "$@"
+
+echo "$#"
+echo "$@"
+echo "$1"
+echo "$2"
+echo "$3"
+echo "$4"
+echo "$5"
diff --git a/evaluation/tests/minimal_grep_stdin_test.in b/evaluation/tests/minimal_grep_stdin_test.in
index e4f85a218..c4d2ac9e4 100755
--- a/evaluation/tests/minimal_grep_stdin_test.in
+++ b/evaluation/tests/minimal_grep_stdin_test.in
@@ -1 +1 @@
-../evaluation/tests/input/1M.txt
+./input/1M.txt
diff --git a/evaluation/tests/sed-test.sh b/evaluation/tests/sed-test.sh
index f5ba0ac85..38d1cc855 100644
--- a/evaluation/tests/sed-test.sh
+++ b/evaluation/tests/sed-test.sh
@@ -1,11 +1,11 @@
 cat $PASH_TOP/evaluation/tests/input/1M.txt |
     sed 's;^d;da;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed 's;$;/;' |
     sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
-    sed 's;^;http://ndr.md/data/noaa/;' |
+    sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
     sed "s#^#$WIKI#" |
     sed s/\$/'0s'/ |
     sed 1d |
     sed 4d |
-    sed "\$d"
\ No newline at end of file
+    sed "\$d"
diff --git a/evaluation/tests/shortest_scripts.sh b/evaluation/tests/shortest_scripts.sh
index 0d3913119..7321d775e 100644
--- a/evaluation/tests/shortest_scripts.sh
+++ b/evaluation/tests/shortest_scripts.sh
@@ -4,4 +4,6 @@
 # +p.95 multiple sed
 # +p.XX crawler
 
-cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
+# cut -d: -f1 -> cut -d : -f 1; as parser recognizes option arguments only if given with whitespace
+# head -15 -> head -n 15; not documented in man page
+cat $IN | xargs file | grep "shell script" | cut -d : -f 1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -n 15
diff --git a/evaluation/tests/test_evaluation_scripts.sh b/evaluation/tests/test_evaluation_scripts.sh
new file mode 100755
index 000000000..b3c6731de
--- /dev/null
+++ b/evaluation/tests/test_evaluation_scripts.sh
@@ -0,0 +1,229 @@
+#!/bin/bash
+# time: print real in seconds, to simplify parsing
+## Necessary to set PASH_TOP
+cd $(dirname $0)
+export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
+export DEBUG=0
+export PASH_LOG=1
+# export DEBUG=1 # Uncomment to print pash output
+## Determines whether the experimental pash flags will be tested.
+## By default they are not.
+export EXPERIMENTAL=0
+for item in $@
+do
+    if [ "--debug" == "$item" ] || [ "-d" == "$item" ]; then
+        export DEBUG=1
+    fi
+    if [ "--no-pash-log" == "$item" ]; then
+        export PASH_LOG=0
+    fi
+    if [ "--experimental" == "$item" ]; then
+        export EXPERIMENTAL=1
+    fi
+done
+
+microbenchmarks_dir="${PASH_TOP}/evaluation/tests"
+intermediary_dir="${PASH_TOP}/evaluation/tests/test_intermediary"
+test_results_dir="${PASH_TOP}/evaluation/tests/results"
+results_time="$test_results_dir/results.time"
+results_time_bash=${results_time}_bash
+results_time_pash=${results_time}_pash
+
+echo "Deleting eager intermediate files..."
+rm -rf "$test_results_dir"
+rm -rf "$intermediary_dir"
+mkdir -p $intermediary_dir
+mkdir -p "$test_results_dir"
+
+echo "Generating inputs..."
+cd "$microbenchmarks_dir/input"
+./setup.sh
+cd -
+
+n_inputs=(
+    2
+    8
+)
+
+if [ "$EXPERIMENTAL" -eq 1 ]; then
+    configurations=(
+        # "" # Commenting this out since the tests take a lot of time to finish
+        "--parallel_pipelines"
+    )
+else
+    configurations=(
+        "--parallel_pipelines --profile_driven"
+    )
+fi
+
+
+## Tests where the compiler will not always succeed (e.g. because they have mkfifo)
+script_microbenchmarks=(
+    diff # (quick-abort) BUG: Might have to do with the named pipes, and the fact that they are reused for parallel and sequential script.
+    set-diff # TODO: Handle redirection after reduce
+    export_var_script # Tests whether exported variables in the scripts that are processed by PaSh runtime are visible to the rest of the script.
+    comm-par-test # Small comm test to ensure non-parallelizability
+    comm-par-test2 # Small comm test with input redirection and hyphen
+    tee_web_index_bug # Tests a tee bug from web index
+    fun-def # Tests whether PaSh can handle a simple function definition
+    bigrams # One-liner
+    spell-grep # Spell variant with `grep -f` instead of `comm`
+)
+
+pipeline_microbenchmarks=(
+    grep # One-liner
+    minimal_sort # One-liner
+    minimal_grep # One-liner
+    topn # One-liner
+    wf # One-liner
+    spell # One-liner
+    shortest_scripts # One-liner
+    alt_bigrams # One-liner
+    deadlock_test # Test to check deadlock prevention using drain_stream
+    double_sort # Checks maximum performance gains from split
+    no_in_script # Tests whether a script can be executed by our infrastructure without having its input in a file called $IN
+    for_loop_simple # Tests whether PaSh can handle a for loop where the body is parallelizable
+    minimal_grep_stdin # Tests whether PaSh can handle a script that reads from stdin
+    micro_10 # A small version of the pipeline above for debugging.
+    sed-test # Tests all sed occurrences in our evaluation to make sure that they work
+    tr-test # Tests all possible behaviors of tr that exist in our evaluation
+    grep-test # Tests some interesting grep invocations
+    ann-agg # Tests custom aggregators in annotations
+    # # # # micro_1000 # Not being run anymore, as it is very slow. Tests whether the compiler is fast enough. It is a huge pipeline without any computation.
+)
+
+
+
+execute_pash_and_check_diff() {
+    TIMEFORMAT="%3R" # %3U %3S"
+    if [ "$DEBUG" -eq 1 ]; then
+        { time "$PASH_TOP/pa.sh" $@ ; } 1> "$pash_output" 2> >(tee -a "${pash_time}" >&2) &&
+            diff -s "$seq_output" "$pash_output" | head | tee -a "${pash_time}" >&2
+    else
+
+        { time "$PASH_TOP/pa.sh" $@ ; } 1> "$pash_output" 2>> "${pash_time}" &&
+            b=$(cat "$pash_time");
+        test_diff_ec=$(cmp -s "$seq_output" "$pash_output" && echo 0 || echo 1)
+        # differ
+        script=$(basename $script_to_execute)
+        if [ $test_diff_ec -ne 0 ]; then
+            c=$(diff -s "$seq_output" "$pash_output" | head)
+            echo "$c$b" > "${pash_time}"
+            echo "$script are not identical" >> $test_results_dir/result_status
+        else
+            echo "Files $seq_output and $pash_output are identical" > "${pash_time}"
+            echo "$script are identical" >> $test_results_dir/result_status
+        fi
+
+    fi
+}
+
+execute_tests() {
+    assert_correctness="$1"
+    microbenchmarks=("${@:2}")
+
+    microbenchmark_configs=( )
+    for i in "${!microbenchmarks[@]}"; do
+        all_flags=${test_flags[@]}
+        microbenchmark_configs[$i]="${microbenchmarks[$i]};${all_flags// /;}"
+    done
+
+    ## This is almost the same loop as the one in execute_evaluation_scripts
+    for microbenchmark_config in "${microbenchmark_configs[@]}"; do
+        IFS=";" read -r -a flags <<< "${microbenchmark_config}"
+        microbenchmark=${flags[0]}
+        echo "Executing test: $microbenchmark"
+        # Execute the sequential script on the first run only
+
+        prefix="${microbenchmarks_dir}/${microbenchmark}"
+
+        export seq_output="${intermediary_dir}/${microbenchmark}_seq_output"
+        seq_time="$test_results_dir/${microbenchmark}_seq.time"
+
+        export script_to_execute="${prefix}.sh"
+        env_file="${prefix}_env_test.sh"
+        funs_file="${prefix}_funs.sh"
+        input_file="${prefix}_test.in"
+
+        if [ -f "$env_file" ]; then
+            . $env_file
+            vars_to_export=$(cut -d= -f1 $env_file)
+            if [ ! -z "$vars_to_export" ]; then
+                export $vars_to_export
+            fi
+        else
+            echo "|-- Does not have env file"
+        fi
+
+        ## Export necessary functions
+        if [ -f "$funs_file" ]; then
+            source $funs_file
+        fi
+
+        ## Redirect the input if there is an input file
+        stdin_redir="/dev/null"
+        if [ -f "$input_file" ]; then
+            stdin_redir="$(cat "$input_file")"
+            echo "|-- Has input file: $stdin_redir"
+        fi
+
+        TIMEFORMAT="${microbenchmark%%.*}:%3R" # %3U %3S"
+        echo -n "|-- Executing the script with bash..."
+        { time /bin/bash "$script_to_execute" > $seq_output ; } \
+            < "$stdin_redir" 2>> "${seq_time}"
+        echo " exited with $?"
+        tail -n1 ${seq_time} >> ${results_time_bash}
+        for conf in "${configurations[@]}"; do
+            for n_in in "${n_inputs[@]}"; do
+                echo "|-- Executing with pash --width ${n_in} ${conf}..."
+                export pash_time="${test_results_dir}/${microbenchmark}_${n_in}_distr_$(echo ${conf} | tr -d ' ').time"
+                export pash_output="${intermediary_dir}/${microbenchmark}_${n_in}_pash_output"
+                export script_conf=${microbenchmark}_${n_in}
+                echo '' > "${pash_time}"
+                # do we need to write the PaSh output ?
+                cat $stdin_redir |
+                    execute_pash_and_check_diff -d $PASH_LOG $assert_correctness ${conf} --width "${n_in}" --output_time $script_to_execute
+                tail -n1 "${pash_time}" >> "${results_time_pash}_${n_in}"
+            done
+        done
+    done
+}
+
+execute_tests "" "${script_microbenchmarks[@]}"
+execute_tests "--assert_compiler_success" "${pipeline_microbenchmarks[@]}"
+
+#cat ${results_time} | sed 's/,/./' > /tmp/a
+#cat /tmp/a | sed 's/@/,/' > ${results_time}
+
+
+if type lsb_release >/dev/null 2>&1 ; then
+    distro=$(lsb_release -i -s)
+elif [ -e /etc/os-release ] ; then
+    distro=$(awk -F= '$1 == "ID" {print $2}' /etc/os-release)
+fi
+
+distro=$(printf '%s\n' "$distro" | LC_ALL=C tr '[:upper:]' '[:lower:]')
+# now do different things depending on distro
+case "$distro" in
+    freebsd*)
+        # change sed to gsed
+        sed () {
+            gsed $@
+        }
+        ;;
+    *)
+        ;;
+esac
+
+echo "group,Bash,Pash2,Pash8" > ${results_time}
+paste -d'@' $test_results_dir/results.time_* | sed 's\,\.\g' | sed 's\:\,\g' | sed 's\@\,\g' >> ${results_time}
+
+#echo "Below follow the identical outputs:"
+#grep "are identical" "$test_results_dir"/result_status | awk '{print $1}'
+
+echo "Below follow the non-identical outputs:"
+grep "are not identical" "$test_results_dir"/result_status | awk '{print $1}'
+
+TOTAL_TESTS=$(cat "$test_results_dir"/result_status | wc -l)
+PASSED_TESTS=$(grep -c "are identical" "$test_results_dir"/result_status)
+echo "Summary: ${PASSED_TESTS}/${TOTAL_TESTS} tests passed."

From e2138ad5b3b2eea05147dcc4b596b444718d62f3 Mon Sep 17 00:00:00 2001
From: Zhicheng Huang
Date: Wed, 29 Nov 2023 17:09:26 -0500
Subject: [PATCH 2/2] Updated url for distr benchmarks and added demo-spell.sh
 as a distr benchmark

---
 .../intro/check-ft-correctness.sh             |  27 +++++
 .../distr_benchmarks/intro/demo-spell.sh      |  16 +++
 .../distr_benchmarks/intro/input/.gitignore   |   3 +
 .../distr_benchmarks/intro/input/setup.sh     |  53 +++++++++
 .../intro/run.distr.faults.sh                 |  66 +++++++++++
 .../distr_benchmarks/nlp/input/setup.sh       |  23 +++-
 .../oneliners/check_ft_correctness.sh         |  27 +++++
 .../distr_benchmarks/oneliners/input/setup.sh |  53 ++++++---
 .../oneliners/run.distr.faults.sh             | 106 ++++++++++++++++++
 .../distr_benchmarks/oneliners/run.distr.sh   |   3 +
 10 files changed, 355 insertions(+), 22 deletions(-)
 create mode 100644 evaluation/distr_benchmarks/intro/check-ft-correctness.sh
 create mode 100755 evaluation/distr_benchmarks/intro/demo-spell.sh
 create mode 100644 evaluation/distr_benchmarks/intro/input/.gitignore
 create mode 100644 evaluation/distr_benchmarks/intro/input/setup.sh
 create mode 100644 evaluation/distr_benchmarks/intro/run.distr.faults.sh
 create mode 100644 evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
 create mode 100644 evaluation/distr_benchmarks/oneliners/run.distr.faults.sh

diff --git a/evaluation/distr_benchmarks/intro/check-ft-correctness.sh b/evaluation/distr_benchmarks/intro/check-ft-correctness.sh
new file mode 100644
index 000000000..81d00d634
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/check-ft-correctness.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Specify the folder where the .out files are located
+folder="$DISH_TOP/evaluation/distr_benchmarks/intro/outputs"
+
+# Loop through the files in the folder
+num_workers=3
+for script_distr_out in "$folder"/*distr.out; do
+    # Extract the script name without the extension
+    script_name=$(basename "$script_distr_out" .distr.out)
+    for ((i = 1; i <= num_workers; i++)); do
+        # get the corresponding .faults.$crashed_worker.out file
+        crashed_worker="worker$i"
+        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"
+
+        # Perform a diff between the two files
+        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
+        if diff -q "$script_faults_out" "$script_distr_out"; then
+            echo "Outputs are identical"
+        else
+            echo "Files are different. Differences are as follows:"
+            diff -y "$script_faults_out" "$script_distr_out"
+        fi
+        echo "-------------------------------------------"
+    done
+
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/demo-spell.sh b/evaluation/distr_benchmarks/intro/demo-spell.sh
new file mode 100755
index 000000000..2872c353f
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/demo-spell.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+cd "$(dirname $0)"
+
+[ -z $PASH_TOP ] && {
+    echo "PASH_TOP not set, maybe $(git rev-parse --show-toplevel)?"
+    exit
+}
+DICT="$DISH_TOP/evaluation/distr_benchmarks/intro/input/sorted_words"
+IN=${IN:-/intro/100M.txt}
+hdfs dfs -cat -ignoreCrc $IN |
+    tr A-Z a-z |
+    tr -cs A-Za-z '\n' |
+    sort |
+    uniq |
+    comm -13 $DICT -
diff --git a/evaluation/distr_benchmarks/intro/input/.gitignore b/evaluation/distr_benchmarks/intro/input/.gitignore
new file mode 100644
index 000000000..f833c1e37
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/input/.gitignore
@@ -0,0 +1,3 @@
+100M.txt
+words
+sorted_words
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/input/setup.sh b/evaluation/distr_benchmarks/intro/input/setup.sh
new file mode 100644
index 000000000..e914a08d0
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/input/setup.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
+. "$PASH_TOP/scripts/utils.sh"
+cd $(dirname $0)
+input_files=("100M.txt")
+local_files=("dict.txt")
+
+[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words
+
+hdfs dfs -mkdir -p /intro
+
+if [ ! -f ./100M.txt ]; then
+    curl -sf --connect-timeout 10 'ndr.md/data/dummy/100M.txt' > 100M.txt
+    if [ $? -ne 0 ]; then
+        # Pipe curl through tac (twice) in order to consume all the output from curl.
+        # This way, curl can write the whole page and not emit an error code.
+        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
+        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
+        touch 100M.txt
+        for (( i = 0; i < 100; i++ )); do
+            cat 1M.txt >> 100M.txt
+        done
+    fi
+    append_nl_if_not ./100M.txt
+fi
+
+if [ ! -f ./words ]; then
+    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
+    if [ $? -ne 0 ]; then
+        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
+        if [ $? -ne 0 ]; then
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
+        fi
+    fi
+    append_nl_if_not words
+fi
+
+## Re-sort words for this machine
+if [ ! -f ./sorted_words ]; then
+    sort words > sorted_words
+fi
+
+# Add files with different replication factors
+for file in "${input_files[@]}"; do
+    hdfs dfs -put $file /intro/$file
+    rm -f $file
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/intro/run.distr.faults.sh b/evaluation/distr_benchmarks/intro/run.distr.faults.sh
new file mode 100644
index 000000000..e292bf3d5
--- /dev/null
+++ b/evaluation/distr_benchmarks/intro/run.distr.faults.sh
@@ -0,0 +1,66 @@
+PASH_FLAGS='--width 8 --r_split'
+export TIMEFORMAT=%R
+export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
+curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict
+
+
+intro_pash(){
+    flags=${1:-$PASH_FLAGS}
+    prefix=${2:-par}
+    prefix=$prefix
+
+    times_file="$prefix.res"
+    outputs_suffix="$prefix.out"
+    time_suffix="$prefix.time"
+    outputs_dir="outputs"
+    pash_logs_dir="pash_logs_$prefix"
+
+    mkdir -p "$outputs_dir"
+    mkdir -p "$pash_logs_dir"
+
+    touch "$times_file"
+    cat $times_file >> $times_file.d
+    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
+    echo '' >> "$times_file"
+
+
+    script="demo-spell"
+
+
+    printf -v pad %30s
+    padded_script="${script}.sh:${pad}"
+    padded_script=${padded_script:0:30}
+
+    outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
+    pash_log="${pash_logs_dir}/${script}.pash.log"
+    single_time_file="${outputs_dir}/${script}.${time_suffix}"
+
+    echo -n "${padded_script}" | tee -a "$times_file"
+    { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
+    cat "${single_time_file}" | tee -a "$times_file"
+
+}
+
+intro_faults() {
+    # For faults, mock crash for all workers
+    num_workers=3
+    # it's important to set the timeout long enough for now to avoid the "crashed" worker coming back alive while its replacement does work
+    # until it's fully supported!
+    timeout=100
+
+    for ((i = 1; i <= num_workers; i++)); do
+        crashed_worker="worker$i"
+        echo Mocking crash for $crashed_worker with timeout of $timeout seconds
+        echo ----------------------------------------------------------------
+        intro_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice worker$i" "faults_$crashed_worker"
+        # echo "Iteration $i"
+        # Your loop body here
+    done
+}
+
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
+intro_pash "$PASH_FLAGS --distributed_exec" "distr"
+
+intro_faults
diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh
index e523d21a8..380739fc5 100755
--- a/evaluation/distr_benchmarks/nlp/input/setup.sh
+++ b/evaluation/distr_benchmarks/nlp/input/setup.sh
@@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
     cd pg
     if [[ "$1" == "--full" ]]; then
         echo 'N.b.: download/extraction will take about 10min'
-        wget ndr.md/data/pg.tar.xz
+        wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
         if [ $? -ne 0 ]; then
             cat <<-'EOF' | sed 's/^ *//'
                 Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
@@ -31,10 +31,21 @@ if [ ! -e ./pg ]; then
         cat pg.tar.xz | tar -xJ
     else
-        wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
-        unzip nlp.zip
-        mv data/* .
-        rm nlp.zip data -rf
+        # wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
+        # unzip nlp.zip
+        # mv data/* .
+        # rm nlp.zip data -rf
+
+        # Mock 1
+        for (( i = 0; i < 60; i++ )); do
+            touch "$i".txt
+            cat ../genesis >> "$i".txt
+        done
+        # Mock 2
+        for (( i = 61; i < 120; i++ )); do
+            touch "$i".txt
+            cat ../exodus >> "$i".txt
+        done
     fi
 
     for f in *.txt; do
@@ -48,4 +59,4 @@ fi
 hdfs dfs -mkdir /nlp
 hdfs dfs -put exodus /nlp/exodus
 hdfs dfs -put genesis /nlp/genesis
-hdfs dfs -put pg /nlp/pg
+hdfs dfs -put pg /nlp
diff --git a/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh b/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
new file mode 100644
index 000000000..5d9faa5d6
--- /dev/null
+++ b/evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Specify the folder where the .out files are located
+folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"
+
+# Loop through the files in the folder
+num_workers=3
+for script_distr_out in "$folder"/*distr.out; do
+    # Extract the script name without the extension
+    script_name=$(basename "$script_distr_out" .distr.out)
+    for ((i = 1; i <= num_workers; i++)); do
+        # get the corresponding .faults.$crashed_worker.out file
+        crashed_worker="worker$i"
+        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"
+
+        # Perform a diff between the two files
+        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
+        if diff -q "$script_faults_out" "$script_distr_out"; then
+            echo "Outputs are identical"
+        else
+            echo "Files are different. Differences are as follows:"
+            diff -y "$script_faults_out" "$script_distr_out"
+        fi
+        echo "-------------------------------------------"
+    done
+
+done
\ No newline at end of file
diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh
index a24725912..c9078d477 100755
--- a/evaluation/distr_benchmarks/oneliners/input/setup.sh
+++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
 #set -e
 
-PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
+PASH_TOP=${PASH_TOP:-$DISH_TOP/pash}
+. "$PASH_TOP/scripts/utils.sh"
+
 # another solution for capturing HTTP status code
 # https://superuser.com/a/590170
 
@@ -13,14 +15,15 @@ if [[ "$1" == "-c" ]]; then
     exit
 fi
 
-hdfs dfs -mkdir /oneliners
+hdfs dfs -mkdir -p /oneliners
 
 if [ ! -f ./1M.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
-        echo 'cannot find 1M.txt -- please contact the developers of pash'
-        exit 1
+        curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
+        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
     fi
+    append_nl_if_not ./1M.txt
 fi
 
 if [ ! -f ./10M.txt ]; then
@@ -38,35 +41,53 @@ if [ ! -f ./100M.txt ]; then
 fi
 
 if [ ! -f ./1G.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
+    if [ $? -ne 0 ]; then
+        touch 1G.txt
+        for (( i = 0; i < 10; i++ )); do
+            cat 100M.txt >> 1G.txt
+        done
+    fi
+fi
+
+if [ ! -f ./words ]; then
+    curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
+    if [ $? -ne 0 ]; then
+        curl -f 'https://zenodo.org/record/7650885/files/words' > words
         if [ $? -ne 0 ]; then
-        echo 'cannot find 1G.txt -- please contact the developers of pash'
-        exit 1
+            if [ $(uname) = 'Darwin' ]; then
+                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
+            else
+                # apt install wamerican-insane
+                cp /usr/share/dict/words words || eexit "cannot find dict file"
+            fi
+        fi
     fi
+    append_nl_if_not words
 fi
 
 # download wamerican-insane dictionary and sort according to machine
 if [ ! -f ./dict.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
     if [ $? -ne 0 ]; then
-        echo 'cannot find dict.txt -- please contact the developers of pash'
-        exit 1
+        sort words > sorted_words
     fi
 fi
 
 if [ ! -f ./all_cmds.txt ]; then
-    curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
+    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
     if [ $? -ne 0 ]; then
         # This should be OK for tests, no need for abort
         ls /usr/bin/* > all_cmds.txt
     fi
+    append_nl_if_not ./all_cmds.txt
 fi
 
 if [ ! -f ./all_cmdsx100.txt ]; then
-        touch all_cmdsx100.txt
-        for (( i = 0; i < 100; i++ )); do
-            cat all_cmds.txt >> all_cmdsx100.txt
-        done
+    touch all_cmdsx100.txt
+    for (( i = 0; i < 100; i++ )); do
+        cat all_cmds.txt >> all_cmdsx100.txt
+    done
 fi
 
 if [ ! -f ./3G.txt ]; then
diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh b/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
new file mode 100644
index 000000000..17bbbcdd4
--- /dev/null
+++ b/evaluation/distr_benchmarks/oneliners/run.distr.faults.sh
@@ -0,0 +1,106 @@
+PASH_FLAGS='--width 8 --r_split'
+export TIMEFORMAT=%R
+export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
+curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict
+
+
+scripts_inputs=(
+    "nfa-regex;1G.txt"
+    "sort;3G.txt"
+    "top-n;3G.txt"
+    "wf;3G.txt"
+    "spell;3G.txt"
+    "diff;3G.txt"
+    "bi-grams;3G.txt"
+    "set-diff;3G.txt"
+    "sort-sort;3G.txt"
+    "shortest-scripts;all_cmdsx100.txt"
+)
+
+# scripts_num_subgraphs=(
+#     "nfa-regex;1"
+#     "sort;1"
+#     "top-n;1"
+#     "wf;1"
+#     "spell;1"
+#     "diff;2"
+#     "bi-grams;1"
+#     "set-diff;2"
+#     "sort-sort;1"
+#     "shortest-scripts;1"
+# )
+# declare -A num_subgraphs_map
+
+# # Populate the associative array
+# for num_subgraph in "${scripts_num_subgraphs[@]}"; do
+#     IFS=";" read -r -a subgraph_info <<< "$num_subgraph"
+#     script_name="${subgraph_info[0]}"
+#     num_subgraphs="${subgraph_info[1]}"
+#     num_subgraphs_map["$script_name"]=$num_subgraphs
+# done
+
+oneliners_pash(){
+    flags=${1:-$PASH_FLAGS}
+    prefix=${2:-par}
+    prefix=$prefix
+
+    times_file="$prefix.res"
+    outputs_suffix="$prefix.out"
+    time_suffix="$prefix.time"
+    outputs_dir="outputs"
+    pash_logs_dir="pash_logs_$prefix"
+
+    mkdir -p "$outputs_dir"
+    mkdir -p "$pash_logs_dir"
+
+    touch "$times_file"
+    cat $times_file >> $times_file.d
+    echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
+    echo '' >> "$times_file"
+
+    for script_input in ${scripts_inputs[@]}
+    do
+        IFS=";" read -r -a script_input_parsed <<< "${script_input}"
+        script="${script_input_parsed[0]}"
+        input="${script_input_parsed[1]}"
+
+        export IN="/oneliners/$input"
+        export dict=
+
+        printf -v pad %30s
+        padded_script="${script}.sh:${pad}"
+        padded_script=${padded_script:0:30}
+
+        outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
+        pash_log="${pash_logs_dir}/${script}.pash.log"
+        single_time_file="${outputs_dir}/${script}.${time_suffix}"
+
+        echo -n "${padded_script}" | tee -a "$times_file"
+        { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
+        cat "${single_time_file}" | tee -a "$times_file"
+    done
+}
+
+oneliners_faults() {
+    # For faults, mock crash for all workers
+    num_workers=3
+    # it's important to set the timeout long enough for now to avoid the "crashed" worker coming back alive while its replacement does work
+    # until it's fully supported!
+    timeout=100
+
+    for ((i = 1; i <= num_workers; i++)); do
+        crashed_worker="worker$i"
+        echo Mocking crash for $crashed_worker with timeout of $timeout seconds
+        echo ----------------------------------------------------------------
+        oneliners_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice worker$i" "faults_$crashed_worker"
+        # echo "Iteration $i"
+        # Your loop body here
+    done
+}
+
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
+oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"
+
+oneliners_faults
diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh
index 680c31797..83ac0d555 100755
--- a/evaluation/distr_benchmarks/oneliners/run.distr.sh
+++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh
@@ -120,6 +120,9 @@ oneliners_hadoopstreaming(){
     mv "hadoop-streaming/$times_file" .
 }
 
+outputs_dir="outputs"
+rm -rf "$outputs_dir"
+
 oneliners_bash
 oneliners_pash "$PASH_FLAGS" "par"
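
Both patches repeat one download idiom across the setup scripts: try a primary mirror with a short connect timeout, fall back to a Zenodo copy, and finally fall back to a locally generated or system-provided file. A minimal bash sketch of that idiom as a reusable helper follows; `fetch_with_fallback`, its argument layout, and the trailing Gutenberg fallback are illustrative assumptions, not code from either patch.

```bash
#!/bin/bash
# Hypothetical helper (not part of the patches): try each URL in order and
# keep the first successful download in $out.
fetch_with_fallback() {
    local out=$1; shift
    local url
    for url in "$@"; do
        # -f: fail on HTTP errors; -s: silent; --connect-timeout bounds how
        # long a dead mirror can stall the setup script.
        if curl -sf --connect-timeout 10 "$url" > "$out"; then
            return 0
        fi
    done
    rm -f "$out"
    return 1
}

# Usage mirroring the 1M.txt logic in the setup scripts: primary mirror,
# then Zenodo, then a locally generated file from Project Gutenberg.
fetch_with_fallback 1M.txt \
    'atlas-group.cs.brown.edu/data/dummy/1M.txt' \
    'https://zenodo.org/record/7650885/files/1M.txt' ||
    curl -sfL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
```

The `tac | tac` in the Gutenberg fallbacks is deliberate, as the in-script comment notes: it forces curl's entire output to be consumed even though `head -c` stops reading early, so curl is not killed by SIGPIPE and does not report a spurious error for a download that actually succeeded.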