UQ-PAC
diff --git a/‎.github/workflows/check-test-tagging.sh
+33 b/‎.github/workflows/check-test-tagging.sh
+33
diff --git a/‎.github/workflows/run-examples.yml
+20-48 b/‎.github/workflows/run-examples.yml
+20-48
diff --git a/‎docs/development/testing.md
+148 b/‎docs/development/testing.md
+148
diff --git a/‎scripts/scalatest.sh
+33 b/‎scripts/scalatest.sh
+33
diff --git a/‎src/test/scala/BitVectorAnalysisTests.scala
+1 b/‎src/test/scala/BitVectorAnalysisTests.scala
+1
diff --git a/‎src/test/scala/DataStructureAnalysisTest.scala
+1 b/‎src/test/scala/DataStructureAnalysisTest.scala
+1
diff --git a/‎src/test/scala/DifferentialAnalysisTest.scala
+24-1 b/‎src/test/scala/DifferentialAnalysisTest.scala
+24-1
@@ -0,0 +1,33 @@
+#!/bin/bash -eu
+# Prints the suites reported by mill, then fails if any lacks a @test_util.tags.*Test annotation.
+set -o pipefail
+
+test_dir=src/test
+
+echo '::group::All test suites:'
+tests="$(./mill test.testOnly -- -t '' -oW | tr -d ':' | sort)"
+echo "$tests"
+echo '::endgroup::'
+echo
+echo '::group::Disabled test suites:'
+# grep exits 1 when nothing matches; having no disabled suites must not abort the script under -e
+grep --color=always '@test_util\.tags\.DisabledTest' --context=1 -R "$test_dir" || [[ $? -eq 1 ]]
+echo '::endgroup::'
+echo
+ok=true
+echo '::group::Test suites with no tag annotations:'
+for t in $tests; do
+  # "|| true": a suite whose class line cannot be found is reported below instead of aborting via -e
+  defn="$(grep 'class\s\+'"$t"'[ (]' --before-context=2 -R "$test_dir" || true)"
+  # look for a line that is entirely "@test_util.tags.*Test"; the leading - is grep's context-line marker
+  if ! grep -q -- '-@test_util\.tags\..\+Test$' <<< "$defn"; then
+    echo 'test suite has no `@test_util.tags.*Test` annotation:' >&2
+    grep --color=always 'class\s\+'"$t"'[ (]' --before-context=2  -R "$test_dir" || echo "$t"
+    echo
+    ok=false
+  fi
+done
+echo '::endgroup::'
+
+$ok  # runs `true` or `false`, which becomes the script's exit status
+
@@ -8,6 +8,18 @@ on:
       - main
   workflow_dispatch:
 jobs:
+  CheckTestTagging:  # fails when check-test-tagging.sh finds a test suite without a tag annotation
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '21'
+      - run: './mill test.compile'  # build tests up front; the script below invokes ./mill test.testOnly
+      - name: Check all test suites have at least one tag
+        run: .github/workflows/check-test-tagging.sh
+
   Scalafmt:
     runs-on: ubuntu-latest
     steps:
@@ -52,8 +64,9 @@ jobs:
       - run: sudo apt-get install -y z3='4.8.12-*'
       - run: dotnet tool install --global boogie --version '3.4.3'
 
-      - name: System Tests
-        run: ./mill test.testOnly 'SystemTests*'
+      - run: ./mill compile
+      - run: ./scripts/scalatest.sh -oID -n test_util.tags.StandardSystemTest
+
       - uses: actions/upload-artifact@v4
         with:
           name: testresult-${{ github.run_number }}
@@ -103,44 +116,11 @@ jobs:
       - run: dotnet tool install --global boogie --version '3.4.3'
 
       - run: ./mill test.compile
+      - run: ./scripts/scalatest.sh -oID -n test_util.tags.UnitTest
 
         # every test with package prefix:
         # sbt "show test:definedTests"
 
-        # note: tests prefixed with '!' are expected to fail.
-        #       if they are fixed, the '!' should be removed.
-
-      - run: ./mill test.testOnly IrreducibleLoop
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly BitVectorAnalysisTests
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly '*IntrusiveListPublicInterfaceTest'
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly util.intrusive_list.IntrusiveListTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly DataStructureAnalysisTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ParamAnalysisTests
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly LiveVarsAnalysisTests
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.ToScalaTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.IRTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.IRToDSLTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.CILVisitorTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.InterpreterTests
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ir.InvariantTest
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly ProcedureSummaryTests || true
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly TaintAnalysisTests
-        if: ${{ ! cancelled() }}
-
   AnalysisSystemTests:
     runs-on: ubuntu-latest
 
@@ -161,15 +141,7 @@ jobs:
       - run: sudo apt-get install -y z3='4.8.12-*'
       - run: dotnet tool install --global boogie --version '3.4.3'
 
-      - run: echo "All systemtest suites:" & ./mill test.testOnly '*SystemTests*' -- -z 'xxxx'
-
-      - run: ./mill test.testOnly DSAMemoryRegionSystemTestsBAP
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly DSAMemoryRegionSystemTestsGTIRB
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly AnalysisSystemTestsBAP
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly AnalysisSystemTestsGTIRB
-        if: ${{ ! cancelled() }}
-      - run: ./mill test.testOnly SimplifySystemTests
-        if: ${{ ! cancelled() }}
+      - run: echo "All systemtest suites:" && ./mill test.testOnly '*SystemTests*' -- -z 'xxxx'
+
+      - run: ./mill compile
+      - run: ./scripts/scalatest.sh -oID -n test_util.tags.AnalysisSystemTest
@@ -0,0 +1,148 @@
+Testing
+=======
+
+Testing is crucial for validating new features and preventing regressions.
+It is important that tests are written for any new features, as well as any
+identified bugs.
+We should also have a clear understanding of what each test
+is testing, whether it currently passes and, if not, why it is currently
+not working.
+This page will describe some tools in the codebase which help with developing
+and maintaining test cases.
+
+Writing test cases
+------------------
+Test cases are written in Scalatest using its
+[`AnyFunSuite`](https://www.scalatest.org/scaladoc/3.1.2/org/scalatest/funsuite/AnyFunSuite.html) style.
+See the AnyFunSuite documentation or existing test cases for syntax and examples.
+
+### Exporting IR structures into test cases
+
+Often, you might have found a particular Basil IR program which demonstrates some bug in the code.
+It is good practice to extract this into a test case, both to validate the fix and ensure the bug doesn't reoccur.
+To help with this, a Basil IR program can be converted to
+a Scala literal by using the [ToScala](https://github.com/UQ-PAC/BASIL/blob/main/src/main/scala/ir/dsl/ToScala.scala)
+trait.
+
+To do this, first `import ir.dsl.given`, then you can use the `.toScala` extension method on programs, procedures, or blocks.
+This gives you a string which is valid Scala code.
+This can be copied and pasted into a unit test.
+When executed, the Scala code will re-construct that Basil IR structure using the DSL.
+
+### Tagging test suites
+
+[Tags](https://www.scalatest.org/scaladoc/3.2.1/org/scalatest/Tag.html)
+are used to categorise test classes based on, roughly, the kind of test (e.g., unit tests or system (end-to-end) tests).
+Each test suite should be tagged with one of the
+[`@test_util.tags.*Test`](https://github.com/UQ-PAC/BASIL/tree/main/src/test/scala/test_util/tags) tags,
+placed on the line before the AnyFunSuite class declaration.
+A test suite may, additionally, be tagged with one or more of the supplementary tags (those not ending in Test).
+
+To run only tests from a specific tag, you can use
+```bash
+./scripts/scalatest.sh -o -n test_util.tags.TagName
+```
+Note that the tag name must be fully-qualified (i.e., including the package name).
+See [the Scalatest runner docs](https://www.scalatest.org/user_guide/using_the_runner) or the `scalatest.sh` file
+for more options.
+
+### Dynamic tests
+
+Note that the `test("test name")` method can be written anywhere within an AnyFunSuite body, including
+within loops or conditionals.
+This allows you to dynamically generate test cases, as in
+Basil's [`SystemTests`](/src/test/scala/SystemTests.scala).
+This should be used sparingly.
+
+
+Maintaining test cases
+----------------------
+Over time, test cases might break due to code changes or refactoring.
+Of course, failing tests should be fixed as soon as possible.
+However, this is not always possible - maybe a test case relies on features not yet implemented.
+It might be reasonable to allow a test to fail for a period of time until the fixes are ready.
+In these cases, it is important that tests which are known/expected to fail are clearly marked
+and the reason for their failure should be recorded in the code.
+
+Generally, the strategy is that failing tests should still be executed.
+They should be annotated so that they are allowed to fail, but if they start passing,
+that should raise an error until the annotation is removed.
+This allows the test code to be an accurate record of the expected outcome of each test.
+
+
+### pendingUntilFixed (for expected failures)
+
+[`pendingUntilFixed`](https://www.scalatest.org/scaladoc/3.2.3/org/scalatest/Assertions.html#pendingUntilFixed(f:=%3EUnit)(implicitpos:org.scalactic.source.Position):org.scalatest.Assertionwithorg.scalatest.PendingStatement) should be used to mark a block of code (typically containing an assertion) as one which is currently expected to fail. This should be used to record test cases which fail due to not-yet-implemented features or known bugs. If a change is made and the code no longer fails, this will cause the test to fail until the pendingUntilFixed is removed.
+
+It should be used within a `test() { ... }` block like so, with a comment documenting the cause of failure and expected future resolution.
+```scala
+test("1 == 2 sometime soon?") {
+  assert(1 == 1, "obviously")
+
+  // broken until we fix maths
+  pendingUntilFixed {
+    assert(1 == 2, "todo")
+  }
+}
+```
+
+- Tests can be marked as ignored by replacing `test()` with `ignore()`. An entire suite can be marked as ignored with the `@org.scalatest.Ignore` annotation.
+
+### TestCustomisation (for dynamically-generated tests)
+
+For some tests, particularly those which are dynamically-generated by a loop, it is not practical to add a `pendingUntilFixed` block
+into the test body.
+For these cases, there is a `trait TestCustomisation` to help with customising dynamically-generated tests
+based on their test case name (for system tests, this includes the file path and compiler/lifter options).
+
+To use this, the test suite class should be made to extend [TestCustomisation](/src/test/scala/test_util/TestCustomisation.scala).
+This defines an abstract method customiseTestsByName which controls the mode of each test case.
+```scala
+@test_util.tags.UnitTest
+class ProcedureSummaryTests extends AnyFunSuite, TestCustomisation {
+
+  override def customiseTestsByName(name: String) = {
+    name match {
+      case "test a" => Mode.NotImplemented("doesn't seem to work yet")
+      case _ => Mode.Normal
+    }
+  }
+
+  test("test a") {
+    assert(false);
+  }
+
+  test("test b") {
+    assert(true);
+  }
+}
+```
+Test cases can be marked as retry, disabled, not implemented, or temporary failure
+(see TestCustomisation source file for more details).
+This modifies the behaviour of the test case and prints helpful output when running the test. For example:
+```c
+- correct/malloc_with_local3/clang:BAP (pending)
+  + NOTE: Test case is customised with: "ExpectFailure(previous failure was: Expected verification success, but got failure. Failing assertion is: assert (load37_1 == R30_in))"
+
+  + Failing assertion ./src/test/correct/malloc_with_local3/clang/malloc_with_local3_bap.bpl:264
+  + 261 |     load36_1, Gamma_load36_1 := memory_load64_le(mem_9, bvadd64(R31_in, 18446744073709551600bv64)), (gamma_load64(Gamma_mem_9, bvadd64(R31_in, 18446744073709551600bv64)) || L(bvadd64(R31_in, 18446744073709551600bv64)));
+    262 |     call rely();
+    263 |     load37_1, Gamma_load37_1 := memory_load64_le(mem_10, bvadd64(R31_in, 18446744073709551608bv64)), (gamma_load64(Gamma_mem_10, bvadd64(R31_in, 18446744073709551608bv64)) || L(bvadd64(R31_in, 18446744073709551608bv64)));
+ >  264 |     assert (load37_1 == R30_in); //is returning to caller-set R30
+    265 |     goto printCharValue_2260_basil_return;
+    266 |   printCharValue_2260_basil_return:
+    267 |     assume {:captureState "printCharValue_2260_basil_return"} true;
+```
+
+```c
+- analysis_differential:malloc_with_local/gcc_O2:GTIRB (pending)
+  + NOTE: Test case is customised with: "ExpectFailure(needs printf_chk)"
+
+  + STDOUT: ""
+```
+
+Modes which expect failure (temp failure and not implemented) will show as "pending" when
+executing scalatest.
+The disabled mode will show as "cancelled".
+Both of these will be output in yellow text if your console is using colour.
+
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Executes the full Scalatest runner (https://www.scalatest.org/user_guide/using_the_runner).
+#
+# This is needed to use some more advanced arguments of the runner, since `./mill test.run`
+# silently ignores some arguments (for example, -n and -l).
+
+# NOTE: executing the runner through this script may try to start a GUI.
+#       to avoid this and use the console, pass -o.
+
+# useful commands
+# ---------------
+#
+# - to include only a specific tag, use -n.
+# - to exclude a specific tag, use -l.
+# - these can be combined. for example, to select all StandardSystemTest except Slow ones:
+#
+#     ./scripts/scalatest.sh -o -n test_util.tags.StandardSystemTest -l test_util.tags.Slow
+#
+# - when passing tags through the command line, the fully-qualified name must be used.
+# - to print individual test durations, pass D to -o. for example, -oD.
+# - to print a summary of failing tests at the end of the output, pass I to -o.
+
+# assumes the "classes" value is colon-separated with the path from field 4 on; -f4- keeps any ':' in the path
+classes="$(./mill show test.compile | grep '"classes"' | cut -d'"' -f4 | cut -d: -f4-)"
+
+if ! [[ -d "$classes" ]]; then
+  echo "unable to determine mill class output directory: $classes" >&2
+  exit 1
+fi
+
+exec ./mill test.runMain org.scalatest.tools.Runner -R "$classes" "$@"
+
@@ -5,6 +5,7 @@ import util.Logger
 
 import scala.runtime.stdLibPatches.Predef.assert
 
+@test_util.tags.UnitTest
 class BitVectorAnalysisTests extends AnyFunSuite {
 
   test("BitVector to Natural - should convert BitVector to natural number") {
 
@@ -27,6 +27,7 @@ import translating.PrettyPrinter.*
   * BASILRESULT.analysis.get.bu is the set of graphs from the end of the bottom-up phase BASILRESULT.analysis.get.td is
   * the set of graphs from the end of the top-down phase
   */
+@test_util.tags.UnitTest
 class DataStructureAnalysisTest extends AnyFunSuite {
 
   def runAnalysis(program: Program): StaticAnalysisContext = {
 
@@ -30,7 +30,28 @@ import util.RunUtils.loadAndTranslate
 
 import scala.collection.mutable
 
-class DifferentialTest extends AnyFunSuite {
+abstract class DifferentialTest extends AnyFunSuite, TestCustomisation {
+  // Selects the Mode for each generated test by its name (known failures listed below); other names run as Mode.Normal.
+  override def customiseTestsByName(name: String) = name match {
+    case "analysis_differential:floatingpoint/clang:GTIRB" | "analysis_differential:floatingpoint/gcc:GTIRB" =>
+      Mode.NotImplemented("needs FP_Mul")
+
+    case "analysis_differential:function1/gcc_O2:BAP" | "analysis_differential:function1/gcc_O2:GTIRB" |
+        "analysis_differential:malloc_with_local/gcc_O2:BAP" | "analysis_differential:malloc_with_local/gcc_O2:GTIRB" |
+        "analysis_differential:malloc_with_local3/gcc_O2:BAP" |
+        "analysis_differential:malloc_with_local3/gcc_O2:GTIRB" =>
+      Mode.NotImplemented("needs printf_chk")
+
+    case "analysis_differential:syscall/clang:BAP" | "analysis_differential:syscall/clang:GTIRB" |
+        "analysis_differential:syscall/clang_O2:GTIRB" | "analysis_differential:syscall/gcc:BAP" |
+        "analysis_differential:syscall/gcc:GTIRB" =>
+      Mode.NotImplemented("needs fork")
+
+    case "analysis_differential:syscall/gcc_O2:BAP" => Mode.TempFailure("traceInit empty")
+    case "analysis_differential:syscall/gcc_O2:GTIRB" => Mode.NotImplemented("needs fork")
+
+    case _ => Mode.Normal
+  }
 
   Logger.setLevel(LogLevel.WARN)
 
@@ -114,6 +135,7 @@ class DifferentialTest extends AnyFunSuite {
   }
 }
 
+@test_util.tags.AnalysisSystemTest
 class DifferentialAnalysisTest extends DifferentialTest {
 
   def runSystemTests(): Unit = {
@@ -147,6 +169,7 @@ class DifferentialAnalysisTest extends DifferentialTest {
   runSystemTests()
 }
 
+@test_util.tags.AnalysisSystemTest
 class DifferentialAnalysisTestSimplification extends DifferentialTest {
 
   def runSystemTests(): Unit = {