diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000..2070d67 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,16 @@ +{ + "image": "mcr.microsoft.com/devcontainers/base:alpine", + "features": { + "ghcr.io/devcontainers/features/nix:1": { + "extraNixConfig": "experimental-features = nix-command flakes" + } + }, + "onCreateCommand": "nix shell nixpkgs#gitMinimal -c nix run nixpkgs#home-manager -- switch --flake git+https://code.europa.eu/ecphp/devs-profile#light --impure", + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.defaultProfile.linux": "fish" + } + } + } + } diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..1e03529 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +root = true + +[*] +indent_size = 4 +charset = utf-8 +end_of_line = lf +trim_trailing_whitespace = true + +[Makefile] +indent_style = tab + +[*.{tex,cls,lua,nix,typ,md}] +trim_trailing_whitespace = false +indent_style = space +indent_size = 2 +max_line_length = 80 diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..a5dbbcb --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake . 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..529c034 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @drupol diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..70181ab --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: monthly + open-pull-requests-limit: 10 diff --git a/.github/settings.yml b/.github/settings.yml new file mode 100644 index 0000000..8bc2fb7 --- /dev/null +++ b/.github/settings.yml @@ -0,0 +1,55 @@ +# https://github.com/probot/settings + +branches: + - name: main + protection: + enforce_admins: false + required_pull_request_reviews: + dismiss_stale_reviews: true + require_code_owner_reviews: true + required_approving_review_count: 1 + restrictions: null + required_linear_history: true + - name: update_flake_lock_action + protection: + enforce_admins: false + required_pull_request_reviews: + dismiss_stale_reviews: true + require_code_owner_reviews: false + restrictions: null + required_linear_history: true + +labels: + - name: bug + color: ee0701 + + - name: dependencies + color: 0366d6 + + - name: enhancement + color: 0e8a16 + + - name: question + color: cc317c + + - name: security + color: ee0701 + + - name: stale + color: eeeeee + +repository: + allow_merge_commit: true + allow_rebase_merge: true + allow_squash_merge: true + default_branch: main + description: "Pol Dellaiera's Master Thesis - Reproducibility In Software Engineering" + homepage: https://github.com/drupol/master-thesis + topics: master-thesis,umons + has_downloads: true + has_issues: true + has_pages: false + has_projects: false + has_wiki: false + name: master-thesis + private: true diff --git a/.github/workflows/build-main.yml b/.github/workflows/build-main.yml new file mode 100644 index 0000000..978cd7d --- /dev/null +++ b/.github/workflows/build-main.yml @@ -0,0 +1,84 @@ 
+name: Release + +on: + push: + branches: + - main + paths: + - "src/**" + - "resources/**" + - "**.nix" + - ".github/workflows/*.yml" + +jobs: + dependencies: + name: Build dependencies + runs-on: ubuntu-latest + outputs: + branch: ${{ steps.extract_branch.outputs.branch }} + + steps: + - name: Check out source files + uses: actions/checkout@v4 + + - name: Extract branch name + shell: bash + run: echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT + id: extract_branch + + build: + name: Build PDF files + runs-on: ubuntu-latest + needs: [dependencies] + + steps: + - name: Set git to use LF + run: | + git config --global core.autocrlf false + git config --global core.eol lf + + - name: Check out source files + uses: actions/checkout@v4 + + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@main + + - name: Build document + run: | + mkdir -p output + nix build .#thesis --out-link result-thesis --quiet + cp -vr --dereference $(readlink -f result-thesis) thesis + cp -ar thesis/* output/ + + - name: Upload build assets + uses: actions/upload-artifact@v4 + with: + name: artefacts + path: output + + assets: + name: Create release + runs-on: ubuntu-latest + needs: [dependencies, build] + + steps: + - name: Download build assets + uses: actions/download-artifact@v4 + + - name: Rename files + working-directory: artefacts + run: | + for f in *.pdf; do cp ${f} ../$(printf '%s\n' "${{ github.run_number }}--${f%.pdf}--${{ github.sha }}.pdf"); done + + - name: Create pre-release (v${{ github.run_number }}-${{ github.sha }}) + id: create_release + uses: softprops/action-gh-release@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: v${{ github.run_number }}-${{ github.sha }} + name: Release ${{ github.run_number }} (${{ github.sha }}) + draft: false + prerelease: true + files: | + ./*.pdf diff --git a/.github/workflows/build-pr.yml b/.github/workflows/build-pr.yml new file mode 100644 
index 0000000..b27671e --- /dev/null +++ b/.github/workflows/build-pr.yml @@ -0,0 +1,46 @@ +name: Build PR + +on: + push: + +jobs: + build: + name: Build PDF files + runs-on: ubuntu-latest + if: github.ref != 'refs/heads/main' + + steps: + - name: Set git to use LF + run: | + git config --global core.autocrlf false + git config --global core.eol lf + + - name: Check out source files + uses: actions/checkout@v4 + + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@main + + - name: Extract branch name + shell: bash + run: echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT + id: extract_branch + + - name: Build document + run: | + nix build .#thesis --out-link result-thesis --quiet + cp -vr --dereference $(readlink -f result-thesis) thesis + mkdir -p artefacts + cp -ar thesis/* artefacts/ + + - name: Rename files + working-directory: artefacts + run: | + for f in *.pdf; do mv ${f} $(printf '%s\n' "${{ github.run_number }}--${f%.pdf}--${{ github.sha }}.pdf"); done + + - name: Upload build assets + uses: actions/upload-artifact@v4 + with: + name: pdf--branch-${{ steps.extract_branch.outputs.branch }}--${{ github.sha }} + path: artefacts + if-no-files-found: error diff --git a/.github/workflows/comment-pr.yml b/.github/workflows/comment-pr.yml new file mode 100644 index 0000000..cc01972 --- /dev/null +++ b/.github/workflows/comment-pr.yml @@ -0,0 +1,78 @@ +--- +name: Comment Artifact URL on PR + +on: + workflow_run: + types: + - "completed" + workflows: + - "Build PR" + +jobs: + comment: + if: github.event.workflow_run.conclusion == 'success' + runs-on: ubuntu-latest + steps: + - name: Get Artifact and Pull request info + env: + GITHUB_TOKEN: ${{ github.token }} + WORKFLOW_RUN_EVENT_OBJ: ${{ toJSON(github.event.workflow_run) }} + OWNER: ${{ github.repository_owner }} + REPO: ${{ github.event.repository.name }} + run: | + PREVIOUS_JOB_ID=$(jq -r '.id' <<< "$WORKFLOW_RUN_EVENT_OBJ") + echo "Previous Job ID: $PREVIOUS_JOB_ID" + 
echo "PREVIOUS_JOB_ID=$PREVIOUS_JOB_ID" >> "$GITHUB_ENV" + + SUITE_ID=$(jq -r '.check_suite_id' <<< "$WORKFLOW_RUN_EVENT_OBJ") + echo "Previous Suite ID: $SUITE_ID" + echo "SUITE_ID=$SUITE_ID" >> "$GITHUB_ENV" + + ARTIFACT_ID=$(gh api "/repos/$OWNER/$REPO/actions/artifacts" \ + --jq ".artifacts.[] | + select(.workflow_run.id==${PREVIOUS_JOB_ID}) | + select(.expired==false) | + .id") + + echo "Artifact ID: $ARTIFACT_ID" + echo "ARTIFACT_ID=$ARTIFACT_ID" >> "$GITHUB_ENV" + + PR_NUMBER=$(jq -r '.pull_requests[0].number' \ + <<< "$WORKFLOW_RUN_EVENT_OBJ") + + echo "Pull request Number: $PR_NUMBER" + echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_ENV" + + HEAD_SHA=$(jq -r '.pull_requests[0].head.sha' \ + <<< "$WORKFLOW_RUN_EVENT_OBJ") + + echo "Head SHA: $HEAD_SHA" + echo "HEAD_SHA=$HEAD_SHA" >> "$GITHUB_ENV" + - name: Find Comment + uses: peter-evans/find-comment@v3 + id: find-comment + with: + issue-number: ${{ env.PR_NUMBER }} + comment-author: "github-actions[bot]" + - name: Update Comment + env: + JOB_PATH: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ env.PREVIOUS_JOB_ID }}" + ARTIFACT_URL: "${{ github.server_url }}/${{ github.repository }}/suites/${{ env.SUITE_ID }}/artifacts/${{ env.ARTIFACT_ID }}" + HEAD_SHA: "${{ env.HEAD_SHA }}" + uses: peter-evans/create-or-update-comment@v4 + with: + issue-number: ${{ env.PR_NUMBER }} + comment-id: ${{ steps.find-comment.outputs.comment-id }} + edit-mode: replace + body: |- + ![badge] + + Build Successful! You can find a link to the downloadable artifact below. 
+ + | Name | Link | + | -------- | ----------------------- | + | Commit | ${{ env.HEAD_SHA }} | + | Logs | ${{ env.JOB_PATH }} | + | Download | ${{ env.ARTIFACT_URL }} | + + [badge]: https://img.shields.io/badge/Build_Success!-0d1117?style=for-the-badge&labelColor=3fb950&logo= diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2e68560 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/.cache/ +/.devenv/ +/.direnv/ +/build/ +/.vscode/ +/out/ +*.bak* +indent.log +/result +/*.pdf +/*.pdfpc +/src/**/*.pdf +lib/**/output +lib/**/result +lib/**/*.pdf +/trunk/ +cert.pem +private-key.pem diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..c47d4a1 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,6 @@ +/.direnv/ +/.idea/ +/vendor/ +/docs/ +/build/ +CHANGELOG.md diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..4b1ce6b --- /dev/null +++ b/.prettierrc @@ -0,0 +1,3 @@ +{ + "proseWrap": "always" +} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..d8af077 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,105 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: Reproducibility in Software Engineering +message: >- + If you use this software, please cite it using the + metadata from this file. +authors: + - given-names: Pol + family-names: Dellaiera + email: pol.dellaiera@protonmail.com + orcid: 'https://orcid.org/0009-0008-7972-7160' + affiliation: UMons +repository-code: 'https://github.com/drupol/master-thesis' +doi: 10.5281/zenodo.1234 +date-released: 2024-07-01 +url: "https://github.com/drupol/master-thesis" +abstract: >- + The concept of reproducibility has long been a cornerstone + in scientific + + research, ensuring that results are robust, repeatable, + and can be independently + + verified. 
This concept has been extended to computer + science, focusing on the + + ability to recreate identical software artefacts. However, + the importance of + + reproducibility in software engineering is often + overlooked, leading to + + challenges in the validation, security, and reliability of + software products. + + + This master's thesis aims to investigate the current state + of reproducibility in + + software engineering, exploring both the barriers and + potential solutions to + + making software more reproducible and raising awareness. + It identifies key + + factors that impede reproducibility such as inconsistent + environments, lack of + + standardisation, and incomplete documentation. To tackle + these issues, I propose + + an empirical comparison of tools facilitating software + reproducibility. + + + To provide a comprehensive assessment of reproducibility + in software + + engineering, this study adopts a methodology that involves + a hands-on evaluation + + of four different methods and tools. Through a systematic + evaluation of these + + tools, this research seeks to determine their + effectiveness in establishing and + + maintaining identical software environments and builds. + + + This study contributes to academic knowledge and offers + practical insights that + + could influence future software development protocols and + standards. 
+keywords: + - reproducibility + - software engineering +license: CC-BY-4.0 +references: + - title: Reproducibility in Software Engineering + authors: + - family-names: Dellaiera + given-names: Pol + year: 2024 + type: thesis + thesis-type: Master's thesis + url: >- + https://not-a-number.io/master-thesis.pdf + institution: + name: University Of Mons +preferred-citation: + type: thesis + authors: + - family-names: "Dellaiera" + given-names: "Pol" + orcid: "https://orcid.org/0009-0008-7972-7160" + doi: "10.0000/00000" + month: 6 + start: 1 # First page number + end: 90 # Last page number + title: "Reproducibility in Software Engineering" + year: 2024 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4a859b6 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +CC-BY-4.0 AND HL3 diff --git a/README.md b/README.md new file mode 100644 index 0000000..28c95cb --- /dev/null +++ b/README.md @@ -0,0 +1,94 @@ +[![Download latest](https://img.shields.io/badge/Download-Latest-brightgreen?style=flat-square)](https://github.com/drupol/master-thesis/releases/latest) +[![CC BY License badge](https://img.shields.io/badge/License-CC--BY--4.0-brightgreen?style=flat-square)](https://creativecommons.org/licenses/by/4.0/) +[![HL3 License badge](https://img.shields.io/badge/License-HL3--full-brightgreen?style=flat-square)](https://firstdonoharm.dev/version/3/0/full.html) + +# UMons Master Thesis - Reproducibility in Software Engineering (RiSE) + +This repository contains the work done during my master thesis at the +[University Of Mons] under the supervision of [Tom +Mens][tom mens university website]. + +## Abstract + +The concept of reproducibility has long been a cornerstone in scientific +research, ensuring that results are robust, repeatable, and can be independently +verified. This concept has been extended to computer science, focusing on the +ability to recreate identical software artefacts. 
However, the importance of +reproducibility in software engineering is often overlooked, leading to +challenges in the validation, security, and reliability of software products. + +This master's thesis aims to investigate the current state of reproducibility in +software engineering, exploring both the barriers and potential solutions to +making software more reproducible and raising awareness. It identifies key +factors that impede reproducibility such as inconsistent environments, lack of +standardisation, and incomplete documentation. To tackle these issues, I propose +an empirical comparison of tools facilitating software reproducibility. + +To provide a comprehensive assessment of reproducibility in software +engineering, this study adopts a methodology that involves a hands-on evaluation +of four different methods and tools. Through a systematic evaluation of these +tools, this research seeks to determine their effectiveness in establishing and +maintaining identical software environments and builds. + +This study contributes to academic knowledge and offers practical insights that +could influence future software development protocols and standards. + +## Repository Structure + +The repository is structured as follows: + +- `src/thesis`: Contains the [Typst] source code +- `nix`: Contains the [Nix] expressions necessary for the build but also for the + local development environment +- `resources`: Contains some resources (images, source code, ...) + +## Licensing + +This work is licensed under a dual licence: the Creative Commons Attribution 4.0 +International ([CC BY 4.0]) and the Hippocratic Licence 3.0 ([HL3]) licences. +You are free to share and adapt the material under the terms of the CC BY 4.0, +provided you give appropriate credit to the original author. You must also use +the material in accordance with the ethical guidelines specified in HL3, +ensuring it is not used to contribute to human rights abuses or other unethical +practices. 
In case of any conflict between the licences, HL3 will take +precedence. + +## Cite + +``` +@masterthesis{dellaieraMasterThesis2024, + title = {Reproducibility in Software Engineering}, + author = {Dellaiera, Pol}, + year = 2024, + month = {June}, + note = {Available at \url{TODO}}, + school = {University of Mons}, + type = {Master's thesis} +} +``` + +## How To Contribute + +We welcome contributions! If you'd like to contribute to this repository, please +feel free to send a pull request. + +### Setting Up Your Development Environment + +To set up a local development environment with all the necessary tools to build +the document, you have two options: + +1. **Using Nix:** Load the default shell environment by running: `nix develop` + +2. **Using DevContainer:** You can also set up a complete development + environment using [DevContainer]. + +The master thesis is written using [Typst], a modern typesetting system. The +primary language of the document is British English. + +[DevContainer]: https://containers.dev/ +[Nix]: https://nixos.org/ +[Typst]: https://typst.app/ +[University Of Mons]: https://www.umons.ac.be/ +[tom mens university website]: https://informatique.umons.ac.be/perso/Mens.Tom/ +[CC BY 4.0]: https://creativecommons.org/licenses/by/4.0/ +[HL3]: https://firstdonoharm.dev/version/3/0/full.html diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..3078304 --- /dev/null +++ b/flake.lock @@ -0,0 +1,217 @@ +{ + "nodes": { + "crane": { + "inputs": { + "nixpkgs": [ + "typst-dev", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1698166613, + "narHash": "sha256-y4rdN4flxRiROqNi1waMYIZj/Fs7L2OrszFk/1ry9vU=", + "owner": "ipetkov", + "repo": "crane", + "rev": "b7db46f0f1751f7b1d1911f6be7daf568ad5bc65", + "type": "github" + }, + "original": { + "owner": "ipetkov", + "repo": "crane", + "type": "github" + } + }, + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1717285511, + "narHash": 
"sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "flake-parts_2": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib_2" + }, + "locked": { + "lastModified": 1696343447, + "narHash": "sha256-B2xAZKLkkeRFG5XcHHSXXcP7To9Xzr59KXeZiRf4vdQ=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "c9afaba3dfa4085dbd2ccb38dfade5141e33d9d4", + "type": "github" + }, + "original": { + "id": "flake-parts", + "type": "indirect" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1718714799, + "narHash": "sha256-FUZpz9rg3gL8NVPKbqU8ei1VkPLsTIfAJ2fdAf5qjak=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "c00d587b1a1afbf200b1d8f0b0e4ba9deb1c7f0e", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-lib": { + "locked": { + "lastModified": 1717284937, + "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz" + } + }, + "nixpkgs-lib_2": { + "locked": { + "dir": "lib", + "lastModified": 1696019113, + "narHash": "sha256-X3+DKYWJm93DRSdC5M6K5hLqzSya9BjibtBsuARoPco=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "f5892ddac112a1e9b3612c39af1b72987ee5783a", + "type": "github" + }, + "original": { + "dir": "lib", + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-unstable": { + "locked": { + "lastModified": 1718870667, + "narHash": "sha256-jab3Kpc8O1z3qxwVsCMHL4+18n5Wy/HHKyu1fcsF7gs=", + "owner": "NixOS", + "repo": "nixpkgs", + 
"rev": "9b10b8f00cb5494795e5f51b39210fed4d2b0748", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1706371002, + "narHash": "sha256-dwuorKimqSYgyu8Cw6ncKhyQjUDOyuXoxDTVmAXq88s=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "c002c6aa977ad22c60398daaa9be52f2203d0006", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "nixpkgs": "nixpkgs", + "nixpkgs-unstable": "nixpkgs-unstable", + "systems": "systems", + "typst-dev": "typst-dev", + "typst-packages": "typst-packages" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_2": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "typst-dev": { + "inputs": { + "crane": "crane", + "flake-parts": "flake-parts_2", + "nixpkgs": "nixpkgs_2", + "systems": "systems_2" + }, + "locked": { + "lastModified": 1718626524, + "narHash": "sha256-DgGQcuRfsduyOxp6Q+IevT8HiK008scjCOwmeYigbzI=", + "owner": "typst", + "repo": "typst", + "rev": "a2c980715958bc3fd71e1f0a5975fea3f5b63b85", + "type": "github" + }, + "original": { + "owner": "typst", + "repo": "typst", + "type": "github" + } + }, + "typst-packages": { + "flake": false, + "locked": { + "lastModified": 1718958452, + 
"narHash": "sha256-S+HIGat3lkMfzxemi4qg8pEavwEmRFOGS7J6HEDOwR0=", + "owner": "typst", + "repo": "packages", + "rev": "470cdbf2b0b8fc2a96b91f4e5e8ccbbf53f22dd9", + "type": "github" + }, + "original": { + "owner": "typst", + "repo": "packages", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..68ca930 --- /dev/null +++ b/flake.nix @@ -0,0 +1,178 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + nixpkgs-unstable.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-parts.url = "github:hercules-ci/flake-parts"; + systems.url = "github:nix-systems/default"; + typst-dev.url = "github:typst/typst"; + typst-packages = { + flake = false; + url = "github:typst/packages"; + }; + }; + + outputs = + inputs@{ ... }: + inputs.flake-parts.lib.mkFlake { inherit inputs; } { + systems = import inputs.systems; + + imports = [ + ./nix/imports/pkgs.nix + ./nix/imports/overlay.nix + ./nix/imports/formatter.nix + ]; + + perSystem = + { pkgs, lib, ... }: + let + # Change here to typst-dev if needed + typst = pkgs.nixpkgs-unstable.typst; + + fontsConf = pkgs.symlinkJoin { + name = "typst-fonts"; + paths = with pkgs; [ + inriafonts + fg-virgil + liberation_ttf + inconsolata-nerdfont + newcomputermodern + ]; + }; + + typst-wrapper = pkgs.typst-wrapper typst pkgs.typst-packages fontsConf; + typstyle = pkgs.typstyle; + + mkBuildDocumentDrv = + documentName: + pkgs.stdenvNoCC.mkDerivation { + name = "build-" + documentName; + + src = pkgs.lib.cleanSource ./.; + + buildInputs = [ typst-wrapper ]; + + buildPhase = '' + runHook preBuild + + ${lib.getExe typst-wrapper} \ + compile \ + --root ./. 
\ + --input rev="${inputs.self.rev or ""}" \ + --input shortRev="${inputs.self.shortRev or ""}" \ + --input builddate="$(date -u -d @${toString (inputs.self.lastModified or "")})" \ + --font-path ${fontsConf} \ + ./src/${documentName}/main.typ \ + ${documentName}.pdf + + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + + install -m640 -D ${documentName}.* -t $out + + runHook postInstall + ''; + }; + + mkBuildDocumentScript = + documentName: + pkgs.writeShellApplication { + name = "build-${documentName}"; + + runtimeInputs = [ typst-wrapper ]; + + text = '' + ${lib.getExe typst-wrapper} \ + compile \ + --root ./. \ + --input rev="${inputs.self.rev or ""}" \ + --input shortRev="${inputs.self.shortRev or ""}" \ + --input builddate="$(date -u -d @${toString (inputs.self.lastModified or "")})" \ + --font-path ${fontsConf} \ + ./src/${documentName}/main.typ \ + ${documentName}.pdf + ''; + }; + + mkWatchDocumentScript = + documentName: + pkgs.writeShellApplication { + name = "watch-${documentName}"; + + runtimeInputs = [ typst-wrapper ]; + + text = '' + ${lib.getExe typst-wrapper} \ + watch \ + --root ./. 
\ + --input rev="${inputs.self.rev or ""}" \ + --input shortRev="${inputs.self.shortRev or ""}" \ + --input builddate="$(date -u -d @${toString (inputs.self.lastModified or "")})" \ + --font-path ${fontsConf} \ + ./src/${documentName}/main.typ \ + ${documentName}.pdf + ''; + }; + + documentDrvs = lib.genAttrs (lib.attrNames ( + lib.filterAttrs (k: v: (v == "directory")) (builtins.readDir ./src) + )) (d: mkBuildDocumentDrv d); + + signPDF = pkgs.writeShellApplication { + name = "sign-pdf"; + + runtimeInputs = [ pkgs.open-pdf-sign ]; + + text = '' + open-pdf-sign \ + --certificate cert.pem \ + --key private-key.pem \ + --no-hint \ + --timestamp \ + --tsa http://timestamp.digicert.com \ + --baseline-lt \ + --add-page \ + --page \ + -1 \ + --width 19 \ + "$@" + ''; + }; + + scriptDrvs = { "sign-pdf" = signPDF; } // lib.foldl' ( + a: i: + a + // { + "build-${i}" = mkBuildDocumentScript i; + "watch-${i}" = mkWatchDocumentScript i; + } + ) { } (lib.attrNames documentDrvs); + in + { + packages = documentDrvs; + + devShells.default = pkgs.mkShellNoCC { + packages = (lib.attrValues scriptDrvs) ++ [ + typst + typst-wrapper + pkgs.gnuplot + pkgs.pympress + ]; + + shellHook = '' + echo "Typst version: ${typst.version}" + echo "Typst bin: ${lib.getExe typst}" + echo "Typst wrapper bin: ${lib.getExe typst-wrapper}" + echo "Typst packages directory: ${pkgs.typst-packages}" + echo "Typst fonts directory: ${fontsConf}" + ''; + + env = { + TYPST_FONT_PATHS = fontsConf; + }; + }; + }; + }; +} diff --git a/lib/scenario-1/Makefile b/lib/scenario-1/Makefile new file mode 100644 index 0000000..3cafd3f --- /dev/null +++ b/lib/scenario-1/Makefile @@ -0,0 +1,11 @@ +clean: + rm -rf datetime + +build: clean + gcc src/datetime.c -o datetime + +check: + nix hash path datetime + +run: + ./datetime diff --git a/lib/scenario-1/src/datetime.c b/lib/scenario-1/src/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-1/src/datetime.c @@ -0,0 +1,10 @@ +#include <stdio.h> + +int 
main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-2/Dockerfile b/lib/scenario-2/Dockerfile new file mode 100644 index 0000000..96d204b --- /dev/null +++ b/lib/scenario-2/Dockerfile @@ -0,0 +1,10 @@ +FROM alpine@sha256:c5b1261d6d3e43071626931fc004f70149baeba2c8ec672bd4f27761f8e1ad6b as build-env +RUN apk add --no-cache build-base +WORKDIR /app +COPY . . +RUN gcc datetime.c -o datetime + +FROM alpine@sha256:c5b1261d6d3e43071626931fc004f70149baeba2c8ec672bd4f27761f8e1ad6b +COPY --from=build-env /app/datetime /app/datetime +WORKDIR /app +CMD ["/app/datetime"] diff --git a/lib/scenario-2/Makefile b/lib/scenario-2/Makefile new file mode 100644 index 0000000..3b7db5d --- /dev/null +++ b/lib/scenario-2/Makefile @@ -0,0 +1,17 @@ +clean: + +build: clean + docker buildx build --no-cache -t datetime:latest --output type=oci,dest=image.tar . + docker load -i image.tar + +check: + docker create --quiet --name datetime datetime:latest >/dev/null 2>&1 + docker cp --quiet datetime:/app/datetime datetime + docker container rm -f datetime >/dev/null 2>&1 + + nix hash path datetime + rm -rf datetime + nix hash path image.tar + +run: + docker run datetime:latest diff --git a/lib/scenario-2/datetime.c b/lib/scenario-2/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-2/datetime.c @@ -0,0 +1,10 @@ +#include <stdio.h> + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-3/Makefile b/lib/scenario-3/Makefile new file mode 100644 index 0000000..ae0caad --- /dev/null +++ b/lib/scenario-3/Makefile @@ -0,0 +1,10 @@ +clean: + +build: clean + guix time-machine --commit=10f3dd0e9e06d71d1bc1615c6a60cc3aa1ad1ff4 -- build -f guix.scm + +check: + nix hash path $$(guix time-machine --commit=10f3dd0e9e06d71d1bc1615c6a60cc3aa1ad1ff4 -- build -f guix.scm) + +run: + $$(guix time-machine --commit=10f3dd0e9e06d71d1bc1615c6a60cc3aa1ad1ff4 -- build -f 
guix.scm)/bin/datetime diff --git a/lib/scenario-3/guix.scm b/lib/scenario-3/guix.scm new file mode 100644 index 0000000..5fd857b --- /dev/null +++ b/lib/scenario-3/guix.scm @@ -0,0 +1,27 @@ +(use-modules (guix) + (guix build-system gnu)) + +(define-public datetime + (package + (name "datetime") + (version "1.0") + (source (local-file "./src" #:recursive? #t)) + (build-system gnu-build-system) + (arguments + '( + #:tests? #f + #:phases + (modify-phases %standard-phases + (delete 'configure) + (replace 'build + (lambda _ (invoke "gcc" "datetime.c" "-o" "datetime"))) + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let ((out (assoc-ref outputs "out"))) + (install-file "datetime" (string-append out "/bin")))))))) + (synopsis "DateTime Program") + (description "This package contains a simple program that shows the current date and time.") + (home-page #f) + (license #f))) + +datetime diff --git a/lib/scenario-3/src/datetime.c b/lib/scenario-3/src/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-3/src/datetime.c @@ -0,0 +1,10 @@ +#include <stdio.h> + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-4/Makefile b/lib/scenario-4/Makefile new file mode 100644 index 0000000..2db6577 --- /dev/null +++ b/lib/scenario-4/Makefile @@ -0,0 +1,14 @@ +clean: + rm -rf output result + nix-collect-garbage -d + +build: clean + nix-build + +check: + $$(nix build --quiet --no-link --print-out-paths nixpkgs#coreutils)/bin/cp -r --dereference "$$(readlink -f result)" "output" + nix hash path output + chmod 777 output + +run: + ./result/bin/datetime diff --git a/lib/scenario-4/default.nix b/lib/scenario-4/default.nix new file mode 100644 index 0000000..cbd2fb1 --- /dev/null +++ b/lib/scenario-4/default.nix @@ -0,0 +1,17 @@ +{ + pkgs ? 
import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/7872526e9c5332274ea5932a0c3270d6e4724f3b.tar.gz") { } +}: + +pkgs.stdenv.mkDerivation { + name = "datetime"; + + src = ./src; + + buildPhase = '' + $CC datetime.c -o datetime + ''; + + installPhase = '' + install -D datetime $out/bin/datetime + ''; +} diff --git a/lib/scenario-4/src/datetime.c b/lib/scenario-4/src/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-4/src/datetime.c @@ -0,0 +1,10 @@ +#include + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-5/Makefile b/lib/scenario-5/Makefile new file mode 100644 index 0000000..56e2277 --- /dev/null +++ b/lib/scenario-5/Makefile @@ -0,0 +1,14 @@ +clean: + rm -rf output result + nix-collect-garbage -d + +build: clean + nix build + +check: + $$(nix build --quiet --no-link --print-out-paths nixpkgs#coreutils)/bin/cp -r --dereference "$$(readlink -f result)" "output" + nix hash path output + chmod 777 output + +run: + ./result/datetime diff --git a/lib/scenario-5/flake.lock b/lib/scenario-5/flake.lock new file mode 100644 index 0000000..02ae899 --- /dev/null +++ b/lib/scenario-5/flake.lock @@ -0,0 +1,79 @@ +{ + "nodes": { + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1709336216, + "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", + "type": "github" + }, + "original": { + "id": "flake-parts", + "type": "indirect" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1710889954, + "narHash": "sha256-Pr6F5Pmd7JnNEMHHmspZ0qVqIBVxyZ13ik1pJtm2QXk=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "7872526e9c5332274ea5932a0c3270d6e4724f3b", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + 
"nixpkgs-lib": { + "locked": { + "dir": "lib", + "lastModified": 1709237383, + "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", + "type": "github" + }, + "original": { + "dir": "lib", + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "nixpkgs": "nixpkgs", + "systems": "systems" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/lib/scenario-5/flake.nix b/lib/scenario-5/flake.nix new file mode 100644 index 0000000..f828897 --- /dev/null +++ b/lib/scenario-5/flake.nix @@ -0,0 +1,27 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-parts.url = "github:hercules-ci/flake-parts"; + }; + + outputs = inputs @ { flake-parts, ... }: flake-parts.lib.mkFlake { inherit inputs; } { + systems = [ "x86_64-linux" "x86_64-darwin" "aarch64-darwin" ]; + + perSystem = { pkgs, ... 
}: { + packages.default = pkgs.stdenv.mkDerivation { + name = "datetime"; + + src = ./src; + + buildPhase = '' + $CC datetime.c -o datetime + ''; + + installPhase = '' + install -D datetime $out/bin/datetime + ''; + }; + }; + + }; +} diff --git a/lib/scenario-5/src/datetime.c b/lib/scenario-5/src/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-5/src/datetime.c @@ -0,0 +1,10 @@ +#include + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-6/Makefile b/lib/scenario-6/Makefile new file mode 100644 index 0000000..933ac3b --- /dev/null +++ b/lib/scenario-6/Makefile @@ -0,0 +1,15 @@ +clean: + rm -rf output result + nix-collect-garbage -d + +build: clean + nix build + gunzip -c result | docker load + +check: + $$(nix build --quiet --no-link --print-out-paths nixpkgs#coreutils)/bin/cp -r --dereference "$$(readlink -f result)" "output" + nix hash path output + chmod 777 output + +run: + docker run datetime:latest diff --git a/lib/scenario-6/flake.lock b/lib/scenario-6/flake.lock new file mode 100644 index 0000000..02ae899 --- /dev/null +++ b/lib/scenario-6/flake.lock @@ -0,0 +1,79 @@ +{ + "nodes": { + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1709336216, + "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", + "type": "github" + }, + "original": { + "id": "flake-parts", + "type": "indirect" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1710889954, + "narHash": "sha256-Pr6F5Pmd7JnNEMHHmspZ0qVqIBVxyZ13ik1pJtm2QXk=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "7872526e9c5332274ea5932a0c3270d6e4724f3b", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-lib": { + "locked": { + "dir": 
"lib", + "lastModified": 1709237383, + "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", + "type": "github" + }, + "original": { + "dir": "lib", + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "nixpkgs": "nixpkgs", + "systems": "systems" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/lib/scenario-6/flake.nix b/lib/scenario-6/flake.nix new file mode 100644 index 0000000..64e99a0 --- /dev/null +++ b/lib/scenario-6/flake.nix @@ -0,0 +1,35 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-parts.url = "github:hercules-ci/flake-parts"; + }; + + outputs = inputs @ { flake-parts, ... }: flake-parts.lib.mkFlake { inherit inputs; } { + systems = [ "x86_64-linux" "x86_64-darwin" "aarch64-darwin" ]; + + perSystem = { pkgs, ... 
}: { + packages.default = + let + datetime = pkgs.stdenv.mkDerivation { + name = "datetime"; + + src = ./src; + + buildPhase = '' + $CC datetime.c -o datetime + ''; + + installPhase = '' + install -D datetime $out/bin/datetime + ''; + }; + in + pkgs.dockerTools.buildLayeredImage { + name = "datetime"; + tag = "latest"; + contents = [ datetime ]; + config.Cmd = [ "/datetime" ]; + }; + }; + }; +} diff --git a/lib/scenario-6/src/datetime.c b/lib/scenario-6/src/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/lib/scenario-6/src/datetime.c @@ -0,0 +1,10 @@ +#include + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/lib/scenario-7/Makefile b/lib/scenario-7/Makefile new file mode 100644 index 0000000..f7f0439 --- /dev/null +++ b/lib/scenario-7/Makefile @@ -0,0 +1,11 @@ +clean: + rm -rf *.pdf + +build: clean + docker run --entrypoint /bin/typst --mount type=bind,source="$$(pwd)"/src,target=/src ghcr.io/typst/typst:latest compile /src/hello-world.typst + +check: + nix hash path src/hello-world.pdf + +run: + nix hash path src/hello-world.pdf diff --git a/lib/scenario-7/src/hello-world.typst b/lib/scenario-7/src/hello-world.typst new file mode 100644 index 0000000..a93f210 --- /dev/null +++ b/lib/scenario-7/src/hello-world.typst @@ -0,0 +1,24 @@ +#set page(width: 10cm, height: auto) +#set heading(numbering: "1.") + += Fibonacci sequence +The Fibonacci sequence is defined through the +recurrence relation $F_n = F_(n-1) + F_(n-2)$. 
+It can also be expressed in _closed form:_ + +$ F_n = round(1 / sqrt(5) phi.alt^n), quad phi.alt = (1 + sqrt(5)) / 2 $ + +#let count = 8 +#let nums = range(1, count + 1) +#let fib(n) = ( + if n <= 2 { 1 } + else { fib(n - 1) + fib(n - 2) } +) + +The first #count numbers of the sequence are: + +#align(center, table( + columns: count, + ..nums.map(n => $F_#n$), + ..nums.map(n => str(fib(n))), +)) diff --git a/lib/scenario-8/Makefile b/lib/scenario-8/Makefile new file mode 100644 index 0000000..1fc10d0 --- /dev/null +++ b/lib/scenario-8/Makefile @@ -0,0 +1,14 @@ +clean: + rm -rf output result + nix-collect-garbage -d + +build: clean + nix build + +check: + $$(nix build --quiet --no-link --print-out-paths nixpkgs#coreutils)/bin/cp -r --dereference "$$(readlink -f result)" "output" + nix hash path output + chmod 777 output + +run: + nix hash path output diff --git a/lib/scenario-8/flake.lock b/lib/scenario-8/flake.lock new file mode 100644 index 0000000..9f751b9 --- /dev/null +++ b/lib/scenario-8/flake.lock @@ -0,0 +1,58 @@ +{ + "nodes": { + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1715865404, + "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1710889954, + "narHash": "sha256-Pr6F5Pmd7JnNEMHHmspZ0qVqIBVxyZ13ik1pJtm2QXk=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "7872526e9c5332274ea5932a0c3270d6e4724f3b", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-lib": { + "locked": { + "lastModified": 1715709767, + "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=", + "type": "tarball", + "url": 
"https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/lib/scenario-8/flake.nix b/lib/scenario-8/flake.nix new file mode 100644 index 0000000..a850d11 --- /dev/null +++ b/lib/scenario-8/flake.nix @@ -0,0 +1,28 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-parts.url = "github:hercules-ci/flake-parts"; + }; + + outputs = inputs @ { flake-parts, ... }: flake-parts.lib.mkFlake { inherit inputs; } { + systems = [ "x86_64-linux" "x86_64-darwin" "aarch64-darwin" ]; + + perSystem = { pkgs, ... }: { + packages.default = pkgs.stdenv.mkDerivation { + name = "typst-hello-world"; + + src = ./src; + + nativeBuildInputs = [ pkgs.typst ]; + + buildPhase = '' + typst compile hello-world.typst hello-world.pdf + ''; + + installPhase = '' + install -D hello-world.pdf $out/hello-world.pdf + ''; + }; + }; + }; +} diff --git a/lib/scenario-8/src/hello-world.typst b/lib/scenario-8/src/hello-world.typst new file mode 100644 index 0000000..a93f210 --- /dev/null +++ b/lib/scenario-8/src/hello-world.typst @@ -0,0 +1,24 @@ +#set page(width: 10cm, height: auto) +#set heading(numbering: "1.") + += Fibonacci sequence +The Fibonacci sequence is defined through the +recurrence relation $F_n = F_(n-1) + F_(n-2)$. 
+It can also be expressed in _closed form:_ + +$ F_n = round(1 / sqrt(5) phi.alt^n), quad phi.alt = (1 + sqrt(5)) / 2 $ + +#let count = 8 +#let nums = range(1, count + 1) +#let fib(n) = ( + if n <= 2 { 1 } + else { fib(n - 1) + fib(n - 2) } +) + +The first #count numbers of the sequence are: + +#align(center, table( + columns: count, + ..nums.map(n => $F_#n$), + ..nums.map(n => str(fib(n))), +)) diff --git a/lib/scenario-9/Makefile b/lib/scenario-9/Makefile new file mode 100644 index 0000000..1fc10d0 --- /dev/null +++ b/lib/scenario-9/Makefile @@ -0,0 +1,14 @@ +clean: + rm -rf output result + nix-collect-garbage -d + +build: clean + nix build + +check: + $$(nix build --quiet --no-link --print-out-paths nixpkgs#coreutils)/bin/cp -r --dereference "$$(readlink -f result)" "output" + nix hash path output + chmod 777 output + +run: + nix hash path output diff --git a/lib/scenario-9/flake.lock b/lib/scenario-9/flake.lock new file mode 100644 index 0000000..9f751b9 --- /dev/null +++ b/lib/scenario-9/flake.lock @@ -0,0 +1,58 @@ +{ + "nodes": { + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1715865404, + "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1710889954, + "narHash": "sha256-Pr6F5Pmd7JnNEMHHmspZ0qVqIBVxyZ13ik1pJtm2QXk=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "7872526e9c5332274ea5932a0c3270d6e4724f3b", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-lib": { + "locked": { + "lastModified": 1715709767, + "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=", + "type": "tarball", + "url": 
"https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/lib/scenario-9/flake.nix b/lib/scenario-9/flake.nix new file mode 100644 index 0000000..a850d11 --- /dev/null +++ b/lib/scenario-9/flake.nix @@ -0,0 +1,28 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-parts.url = "github:hercules-ci/flake-parts"; + }; + + outputs = inputs @ { flake-parts, ... }: flake-parts.lib.mkFlake { inherit inputs; } { + systems = [ "x86_64-linux" "x86_64-darwin" "aarch64-darwin" ]; + + perSystem = { pkgs, ... }: { + packages.default = pkgs.stdenv.mkDerivation { + name = "typst-hello-world"; + + src = ./src; + + nativeBuildInputs = [ pkgs.typst ]; + + buildPhase = '' + typst compile hello-world.typst hello-world.pdf + ''; + + installPhase = '' + install -D hello-world.pdf $out/hello-world.pdf + ''; + }; + }; + }; +} diff --git a/lib/scenario-9/src/hello-world.typst b/lib/scenario-9/src/hello-world.typst new file mode 100644 index 0000000..1e1c214 --- /dev/null +++ b/lib/scenario-9/src/hello-world.typst @@ -0,0 +1,25 @@ +#set document(date: none) +#set page(width: 10cm, height: auto) +#set heading(numbering: "1.") + += Fibonacci sequence +The Fibonacci sequence is defined through the +recurrence relation $F_n = F_(n-1) + F_(n-2)$. 
+It can also be expressed in _closed form:_ + +$ F_n = round(1 / sqrt(5) phi.alt^n), quad phi.alt = (1 + sqrt(5)) / 2 $ + +#let count = 8 +#let nums = range(1, count + 1) +#let fib(n) = ( + if n <= 2 { 1 } + else { fib(n - 1) + fib(n - 2) } +) + +The first #count numbers of the sequence are: + +#align(center, table( + columns: count, + ..nums.map(n => $F_#n$), + ..nums.map(n => str(fib(n))), +)) diff --git a/nix/imports/formatter.nix b/nix/imports/formatter.nix new file mode 100644 index 0000000..d2553a2 --- /dev/null +++ b/nix/imports/formatter.nix @@ -0,0 +1,8 @@ +{ ... }: +{ + perSystem = + { pkgs, ... }: + { + formatter = pkgs.nixfmt-rfc-style; + }; +} diff --git a/nix/imports/overlay.nix b/nix/imports/overlay.nix new file mode 100644 index 0000000..23733a6 --- /dev/null +++ b/nix/imports/overlay.nix @@ -0,0 +1,9 @@ +{ inputs, ... }: +{ + flake = { + overlays.default = final: prev: { + typst-packages = prev.callPackage ../pkgs/typst-packages.nix { src = inputs.typst-packages; }; + typst-wrapper = prev.callPackage ../pkgs/typst-wrapper.nix { }; + }; + }; +} diff --git a/nix/imports/pkgs.nix b/nix/imports/pkgs.nix new file mode 100644 index 0000000..189087d --- /dev/null +++ b/nix/imports/pkgs.nix @@ -0,0 +1,17 @@ +{ inputs, self, ... }: +{ + perSystem = + { system, ... }: + { + _module.args.pkgs = import self.inputs.nixpkgs { + inherit system; + overlays = [ + inputs.typst-dev.overlays.default + inputs.self.overlays.default + (final: prev: { nixpkgs-unstable = import inputs.nixpkgs-unstable { inherit system; }; }) + ]; + config = { + }; + }; + }; +} diff --git a/nix/imports/shell.nix b/nix/imports/shell.nix new file mode 100644 index 0000000..f9059e7 --- /dev/null +++ b/nix/imports/shell.nix @@ -0,0 +1,17 @@ +{ inputs, self, ... }: +{ + perSystem = + { system, ... 
}: + { + _module.args.pkgs = import self.inputs.nixpkgs { + inherit system; + overlays = [ + inputs.typst-dev.overlays.default + inputs.self.overlays.default + ]; + config = { + allowUnfree = true; + }; + }; + }; +} diff --git a/nix/pkgs/typst-packages.nix b/nix/pkgs/typst-packages.nix new file mode 100644 index 0000000..ab06a39 --- /dev/null +++ b/nix/pkgs/typst-packages.nix @@ -0,0 +1,18 @@ +{ stdenvNoCC, src }: + +stdenvNoCC.mkDerivation { + name = "typst-packages"; + + inherit src; + + dontBuild = true; + + installPhase = '' + runHook preInstall + + mkdir -p $out/typst/packages + cp -r packages/preview $out/typst/packages/ + + runHook postInstall + ''; +} diff --git a/nix/pkgs/typst-wrapper.nix b/nix/pkgs/typst-wrapper.nix new file mode 100644 index 0000000..3570825 --- /dev/null +++ b/nix/pkgs/typst-wrapper.nix @@ -0,0 +1,15 @@ +{ lib, writeShellApplication, ... }: + +typstDrv: typst-packages: typstFontPaths: +writeShellApplication { + name = "typst-wrapper"; + + runtimeInputs = [ + typstDrv + typst-packages + ]; + + text = '' + TYPST_FONT_PATHS=${typstFontPaths} XDG_CACHE_HOME=${typst-packages} ${lib.getExe typstDrv} "$@" + ''; +} diff --git a/public-key.pem b/public-key.pem new file mode 100644 index 0000000..5ab74d4 --- /dev/null +++ b/public-key.pem @@ -0,0 +1,4 @@ +-----BEGIN PUBLIC KEY----- +MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEd4ocLaIaKaJ++bphe0y6P3sYkSAY +THcRpUjAdeUHHfXeFAI8hplUBwnbr7PH9BBU4HnWMeLuwDmRe3t2Ncxhrg== +-----END PUBLIC KEY----- diff --git a/resources/graphviz/my-app-not-ok.dot b/resources/graphviz/my-app-not-ok.dot new file mode 100644 index 0000000..b1bb9df --- /dev/null +++ b/resources/graphviz/my-app-not-ok.dot @@ -0,0 +1,62 @@ +digraph G { + +rankdir="BT" +node [ style="filled", margin=.05]; +edge [ dir="back" ]; +ratio=.5 + +"pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" [label = "python3-3.10.12", fillcolor = "orange", color = "orange"]; +"6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" [label = "bzip2-1.0.8"]; 
+"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" [label = "ncurses-6.4"]; +"fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" [label = "sqlite-3.43.1", fillcolor = "orange", color = "orange"]; +"g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" [label = "readline-8.2p1"]; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" [label = "zlib-1.3", fillcolor = "red", color = "red"]; +"jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" [label = "gdbm-1.23"]; +"l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" [label = "openssl-3.0.10"]; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" [label = "glibc-2.37-8"]; +"8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" [label = "libidn2-2.3.4"]; +"br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1" [label = "libunistring-1.1"]; +"ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" [label = "libxcrypt-4.4.36"]; +"q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" [label = "expat-2.5.0"]; +"rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc" [label = "xgcc-12.3.0-libgcc"]; +"xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" [label = "xz-5.4.4"]; +"xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" [label = "bash-5.2-p15"]; +"xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" [label = "gcc-12.3.0-lib"]; +"xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc" [label = "gcc-12.3.0-libgcc"]; +"35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" [label = "libffi-3.4.4"]; + +"35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" [ color = "orange" ]; +"g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" -> 
"pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" [ color = "orange" ]; +"jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; + +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" []; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" -> "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" [ color = "orange" ]; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" []; +"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" -> "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" []; 
+"8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" -> "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" []; +"rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc" -> "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" []; +"br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1" -> "8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" []; +"xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc" -> "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" []; +} diff --git a/resources/graphviz/python.dot b/resources/graphviz/python.dot new file mode 100644 index 0000000..4c85916 --- /dev/null +++ b/resources/graphviz/python.dot @@ -0,0 +1,66 @@ +digraph G { + +rankdir="BT" +node [ style="filled", margin=.05]; +edge [ dir="back" ]; +ratio=.5 + +"pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" [label = "python3-3.10.12"]; +"6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" [label = "bzip2-1.0.8"]; +"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" [label = "ncurses-6.4"]; +"dk5vk3c9zknbjzzxmiglzv46qgv32gb0-tzdata-2023c" [label = "tzdata-2023c"]; +"fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" [label = "sqlite-3.43.1"]; +"g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" [label = "readline-8.2p1"]; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" [label = "zlib-1.3"]; +"jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" [label = "gdbm-1.23"]; +"l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" [label = "openssl-3.0.10"]; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" [label = "glibc-2.37-8"]; 
+"8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" [label = "libidn2-2.3.4"]; +"br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1" [label = "libunistring-1.1"]; +"ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" [label = "libxcrypt-4.4.36"]; +"pfqk28f0yaq18ha10ri9d3a8z5kv8s6l-mailcap-2.1.53" [label = "mailcap-2.1.53"]; +"q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" [label = "expat-2.5.0"]; +"rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc" [label = "xgcc-12.3.0-libgcc"]; +"xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" [label = "xz-5.4.4"]; +"xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" [label = "bash-5.2-p15"]; +"xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" [label = "gcc-12.3.0-lib"]; +"xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc" [label = "gcc-12.3.0-libgcc"]; +"35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" [label = "libffi-3.4.4"]; + +"35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"dk5vk3c9zknbjzzxmiglzv46qgv32gb0-tzdata-2023c" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" -> 
"pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"pfqk28f0yaq18ha10ri9d3a8z5kv8s6l-mailcap-2.1.53" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; +"xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" -> "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12" []; + +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" []; +"ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" -> "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1" []; +"d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4" -> "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10" []; +"8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" -> "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" []; +"rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc" -> "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" []; +"br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1" -> "8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" 
-> "ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15" []; +"ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8" -> "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" []; +"xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc" -> "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib" []; +} diff --git a/resources/graphviz/scientific-method-with-reproducibility.dot b/resources/graphviz/scientific-method-with-reproducibility.dot new file mode 100644 index 0000000..d46b245 --- /dev/null +++ b/resources/graphviz/scientific-method-with-reproducibility.dot @@ -0,0 +1,20 @@ +digraph G { + layout="twopi"; + root = "step6"; + node [shape="circle" fixedsize=true width=".8"] + graph [pad=".25" nodesep="1" ranksep="1.3"] + ordering="in" + edge [dir="back"]; + + step1 -> step0 -> step5 -> step4 -> step3 -> step2 -> step1; + + step0 -> step6 [style=invis]; + step1 -> step6 [style=invis]; + step2 -> step6 [style=invis]; + step3 -> step6 [style=invis]; + step5 -> step6 [dir=forward penwidth=3]; + step4 -> step6 [penwidth=3]; + step5 -> step6 [style=invis]; + + step6 [penwidth=3]; +} diff --git a/resources/graphviz/scientific-method.dot b/resources/graphviz/scientific-method.dot new file mode 100644 index 0000000..70bc245 --- /dev/null +++ b/resources/graphviz/scientific-method.dot @@ -0,0 +1,18 @@ +digraph G { + layout="twopi"; + root = "step6"; + node [shape="circle" fixedsize=true width=".8"] + graph [pad=".25" nodesep="1" ranksep="1.3"] + ordering="in" + edge [dir="back"]; + + step1 -> step0 -> step5 -> step4 -> step3 -> step2 -> step1; + + step0 -> step6 [style=invis]; + step1 -> step6 [style=invis]; + step2 -> step6 [style=invis]; + step3 -> step6 [style=invis]; + step4 -> step6 
[style=invis]; + step5 -> step6 [style=invis]; + step6 [style="invis"]; +} diff --git a/resources/images/ORCIDiD_iconvector.svg b/resources/images/ORCIDiD_iconvector.svg new file mode 100644 index 0000000..d2309de --- /dev/null +++ b/resources/images/ORCIDiD_iconvector.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/resources/images/PXL_20230719_092843523-01.jpeg b/resources/images/PXL_20230719_092843523-01.jpeg new file mode 100644 index 0000000..f3baae6 Binary files /dev/null and b/resources/images/PXL_20230719_092843523-01.jpeg differ diff --git a/resources/images/binary.svg b/resources/images/binary.svg new file mode 100644 index 0000000..fc13297 --- /dev/null +++ b/resources/images/binary.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/resources/images/build-inputs1.svg b/resources/images/build-inputs1.svg new file mode 100644 index 0000000..8b108cf --- /dev/null +++ b/resources/images/build-inputs1.svg @@ -0,0 +1,21 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/build-inputs2.svg b/resources/images/build-inputs2.svg new file mode 100644 index 0000000..dd9922b --- /dev/null +++ b/resources/images/build-inputs2.svg @@ -0,0 +1,591 @@ + + diff --git a/resources/images/builds.svg b/resources/images/builds.svg new file mode 100644 index 0000000..ab32c06 --- /dev/null +++ b/resources/images/builds.svg @@ -0,0 +1,21 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/builds2.svg b/resources/images/builds2.svg new file mode 100644 index 0000000..3e9d990 --- /dev/null +++ b/resources/images/builds2.svg @@ -0,0 +1,21 @@ + + + + + + + + Checksum: 1ab...3d6Checksum: 1ab...3d6Checksum: 1ab...3d6Checksum: 1ab...3d6Checksum: 1ab...3d6Checksum: d2f...4c7 \ No newline at end of file diff --git a/resources/images/circle-check.svg b/resources/images/circle-check.svg new file mode 100644 index 0000000..7a8ecba --- /dev/null +++ b/resources/images/circle-check.svg @@ -0,0 
+1 @@ + \ No newline at end of file diff --git a/resources/images/circle-exclamation.svg b/resources/images/circle-exclamation.svg new file mode 100644 index 0000000..f1e1334 --- /dev/null +++ b/resources/images/circle-exclamation.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/circle-info.svg b/resources/images/circle-info.svg new file mode 100644 index 0000000..b7481c2 --- /dev/null +++ b/resources/images/circle-info.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/circle-question.svg b/resources/images/circle-question.svg new file mode 100644 index 0000000..cad7c00 --- /dev/null +++ b/resources/images/circle-question.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/computation-cogs.svg b/resources/images/computation-cogs.svg new file mode 100644 index 0000000..9a125e7 --- /dev/null +++ b/resources/images/computation-cogs.svg @@ -0,0 +1,17 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/diffoscope-report.svg b/resources/images/diffoscope-report.svg new file mode 100644 index 0000000..dace9e1 --- /dev/null +++ b/resources/images/diffoscope-report.svg @@ -0,0 +1,1954 @@ + + +Qt Svg Document +Generated with Qt + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +1010 B +518 B +483 B + + + +build1 + + + + vs. 
+ + + +build2 +strings --all --bytes=8 {} + + + +Offset 1, 16 lines modified +Offset 1, 16 lines modified +1 +/lib64/ld-linux-x86-64.so.2 +1 +/lib64/ld-linux-x86-64.so.2 +2 +__libc_start_main +2 +__libc_start_main +3 +libc.so.6 +3 +libc.so.6 +4 +GLIBC_2.2.5 +4 +GLIBC_2.2.5 +5 +GLIBC_2.34 +5 +GLIBC_2.34 +6 +/lib:/usr/lib +6 +/lib:/usr/lib +7 +__gmon_start__ +7 +__gmon_start__ +8 +18:23:34 + +9 + + + + + + + +F + + + + + + + +eb + + + +· +· + + + +1 + + + + + + + +· + + + + + + + +20 +2 + + + + + + + +3 +8 + + + + + + + +15:4 +1 + + + + + + + +:3 +2 + +9 +Mar + + + +· +· + + + +2 + + + +· + + + +2024 +10 +Built + + + +· + + + +the + + + +· + + + +%s + + + +· + + + +at + + + +· + + + +%s. +10 +Built + + + +· + + + +the + + + +· + + + +%s + + + +· + + + +at + + + +· + + + +%s. +11 +GCC: + + + +· + + + +(GNU) + + + +· + + + +13.2.0 +11 +GCC: + + + +· + + + +(GNU) + + + +· + + + +13.2.0 +12 +__abi_tag +12 +__abi_tag +13 +crtbegin.o +13 +crtbegin.o +14 +deregister_tm_clones +14 +deregister_tm_clones +15 +__do_global_dtors_aux +15 +__do_global_dtors_aux +16 +completed.0 +16 +completed.0 + + + +readelf --wide --decompress --hex-dump=.rodata {} + + + +Offset 1, 6 lines modified +Offset 1, 6 lines modified + + +1 +Hex + + + +· + + + +dump + + + +· + + + +of + + + +· + + + +section + + + +· + + + +'.rodata': +1 +Hex + + + +· + + + +dump + + + +· + + + +of + + + +· + + + +section + + + +· + + + +'.rodata': +2 + + + +· +· + + + +0x00402000 + + + +· + + + +01000200 + + + +· + + + +313 + + + + + + + +8 +3a3 + + + + + + + +2 + + + +· + + + +3 + + + + + + + +3 +3a333 + + + + + + + +4 + + + +· + + + +004 + + + + + + + +6 +6 + + + + + + + +56 +2 + + + +· + + + +....1 + + + + + + + +8 +: + + + + + + + +23 +:3 + + + + + + + +4 +. 
+ + + + + + + +Feb +2 + + + +· +· + + + +0x00402000 + + + +· + + + +01000200 + + + +· + + + +313 + + + + + + + +5 +3a3 + + + + + + + +4 + + + +· + + + +3 + + + + + + + +1 +3a333 + + + + + + + +2 + + + +· + + + +004 + + + + + + + +d +6 + + + + + + + +17 +2 + + + +· + + + +....1 + + + + + + + +5 +: + + + + + + + +41 +:3 + + + + + + + +2 +. + + + + + + + +Mar +3 + + + +· +· + + + +0x00402010 + + + +· + + + +20203 + + + + + + + +1 +20 + + + +· + + + +3230323 + + + + + + + +3 + + + +· + + + +00427569 + + + +· + + + +6c742074 + + + +· +· +· + + + + + + + +1 + + + +· + + + +202 + + + + + + + +3 +.Built + + + +· + + + +t +3 + + + +· +· + + + +0x00402010 + + + +· + + + +20203 + + + + + + + +2 +20 + + + +· + + + +3230323 + + + + + + + +4 + + + +· + + + +00427569 + + + +· + + + +6c742074 + + + +· +· +· + + + + + + + +2 + + + +· + + + +202 + + + + + + + +4 +.Built + + + +· + + + +t +4 + + + +· +· + + + +0x00402020 + + + +· + + + +68652025 + + + +· + + + +73206174 + + + +· + + + +2025732e + + + +· + + + +0a00 + + + +· +· +· +· +· + + + +he + + + +· + + + +%s + + + +· + + + +at + + + +· + + + +%s... +4 + + + +· +· + + + +0x00402020 + + + +· + + + +68652025 + + + +· + + + +73206174 + + + +· + + + +2025732e + + + +· + + + +0a00 + + + +· +· +· +· +· + + + +he + + + +· + + + +%s + + + +· + + + +at + + + +· + + + +%s... + + + + + + diff --git a/resources/images/diffoscope-typst.svg b/resources/images/diffoscope-typst.svg new file mode 100644 index 0000000..394fa7d --- /dev/null +++ b/resources/images/diffoscope-typst.svg @@ -0,0 +1,663 @@ + + +Qt Svg Document +Generated with Qt + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +5.84 +KB +5.72 +KB + + + +/nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world + + + + vs. 
+ + + +/nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world.check +/nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world/hello-world.pdf + + + + vs. + + + +/nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world.check/hello-world.pdf + + + +Offset 768, 17 lines modified +Offset 768, 17 lines modified +768 +</object> +768 +</object> + + +769 +<object + + + +· + + + +id="29"> +769 +<object + + + +· + + + +id="29"> +770 +<dict + + + +· + + + +size="3"> +770 +<dict + + + +· + + + +size="3"> +771 +<key>Creator</key> +771 +<key>Creator</key> +772 +<value><string + + + +· + + + +size="12">Typst + + + +· + + + +0.11.0</string></value> +772 +<value><string + + + +· + + + +size="12">Typst + + + +· + + + +0.11.0</string></value> +773 +<key>CreationDate</key> +773 +<key>CreationDate</key> +774 +<value><string + + + +· + + + +size="17">D:202403201 + + + + + + + +75712 +Z</string></value> +774 +<value><string + + + +· + + + +size="17">D:202403201 + + + + + + + +81055 +Z</string></value> +775 +<key>ModDate</key> +775 +<key>ModDate</key> +776 +<value><string + + + +· + + + +size="17">D:202403201 + + + + + + + +75712 +Z</string></value> +776 +<value><string + + + +· + + + +size="17">D:202403201 + + + + + + + +81055 +Z</string></value> +777 +</dict> +777 +</dict> +778 +</object> +778 +</object> + + +779 +<object + + + +· + + + +id="30"> +779 +<object + + + +· + + + +id="30"> +780 +<stream> +780 +<stream> +781 +<props> +781 +<props> +782 +<dict + + + +· + + + +size="3"> +782 +<dict + + + + diff --git a/resources/images/flake-vs-legacy.jpg b/resources/images/flake-vs-legacy.jpg new file mode 100644 index 0000000..22981ce Binary files /dev/null and b/resources/images/flake-vs-legacy.jpg differ diff --git a/resources/images/hello-world.svg b/resources/images/hello-world.svg new file mode 100644 index 0000000..15e93f9 --- /dev/null +++ b/resources/images/hello-world.svg @@ -0,0 +1,836 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/resources/images/highlighter-solid.svg b/resources/images/highlighter-solid.svg new file mode 100644 index 0000000..58bd82d --- /dev/null +++ b/resources/images/highlighter-solid.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/inputs-cube-2.svg b/resources/images/inputs-cube-2.svg new file mode 100644 index 0000000..86a03f9 --- /dev/null +++ 
b/resources/images/inputs-cube-2.svg @@ -0,0 +1,21 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/inputs-cube-3.svg b/resources/images/inputs-cube-3.svg new file mode 100644 index 0000000..4933169 --- /dev/null +++ b/resources/images/inputs-cube-3.svg @@ -0,0 +1,21 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/inputs-cube.svg b/resources/images/inputs-cube.svg new file mode 100644 index 0000000..32711cd --- /dev/null +++ b/resources/images/inputs-cube.svg @@ -0,0 +1,3 @@ + + + diff --git a/resources/images/inputs-icon.svg b/resources/images/inputs-icon.svg new file mode 100644 index 0000000..7df2235 --- /dev/null +++ b/resources/images/inputs-icon.svg @@ -0,0 +1,17 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/lightbulb-solid.svg b/resources/images/lightbulb-solid.svg new file mode 100644 index 0000000..5fc5e35 --- /dev/null +++ b/resources/images/lightbulb-solid.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/note-sticky.svg b/resources/images/note-sticky.svg new file mode 100644 index 0000000..584f481 --- /dev/null +++ b/resources/images/note-sticky.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/quote-left.svg b/resources/images/quote-left.svg new file mode 100644 index 0000000..eb1adfe --- /dev/null +++ b/resources/images/quote-left.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/quote-right.svg b/resources/images/quote-right.svg new file mode 100644 index 0000000..75e111d --- /dev/null +++ b/resources/images/quote-right.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/images/rules.svg b/resources/images/rules.svg new file mode 100644 index 0000000..8d2fa6b --- /dev/null +++ b/resources/images/rules.svg @@ -0,0 +1,21 @@ + + + + + + + + \ No newline at end of file diff --git a/resources/images/security-independent-builds.svg 
b/resources/images/security-independent-builds.svg new file mode 100644 index 0000000..a28d31f --- /dev/null +++ b/resources/images/security-independent-builds.svg @@ -0,0 +1,10 @@ + + + + + Independent buildIndependent buildChecksum:4e14e...4c0a9Source codeBuildArtefactsSoftware VendorLinux distribution, app store, etcSource codeBuildArtefactsIndependent buildChecksum:4e14e...4c0a9Checksum:4e14e...4c0a9Source codeChecksum:806e7...9c271ArtefactsEnd-user systemChecksum:806e7...9c271Binary distributioncomparecomparecompare diff --git a/resources/images/sourcecode.svg b/resources/images/sourcecode.svg new file mode 100644 index 0000000..b7d9caa --- /dev/null +++ b/resources/images/sourcecode.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/resources/sourcecode/bash/bash-diffoscope-comparison.log b/resources/sourcecode/bash/bash-diffoscope-comparison.log new file mode 100644 index 0000000..e3bc5d0 --- /dev/null +++ b/resources/sourcecode/bash/bash-diffoscope-comparison.log @@ -0,0 +1 @@ +$ diffoscope build1 build2 diff --git a/resources/sourcecode/bash/bash-fixing-builds.log b/resources/sourcecode/bash/bash-fixing-builds.log new file mode 100644 index 0000000..e71e344 --- /dev/null +++ b/resources/sourcecode/bash/bash-fixing-builds.log @@ -0,0 +1,6 @@ +$ export SOURCE_DATE_EPOCH=1709373544 +$ gcc -o build1 datetime.c +$ gcc -o build2 datetime.c +$ sha256sum build* +98f0419783bb3b06b45eaf5cc0efeef9408c4dad1e9eec9eb153dc7a6cc6962f build1 +98f0419783bb3b06b45eaf5cc0efeef9408c4dad1e9eec9eb153dc7a6cc6962f build2 diff --git a/resources/sourcecode/bash/bash-gcc-not-reproducible-builds.log b/resources/sourcecode/bash/bash-gcc-not-reproducible-builds.log new file mode 100644 index 0000000..218ab3a --- /dev/null +++ b/resources/sourcecode/bash/bash-gcc-not-reproducible-builds.log @@ -0,0 +1,5 @@ +$ gcc -o build1 datetime.c +$ gcc -o build2 datetime.c +$ sha256sum build* +5d0b1ae966c719862971bd51b4c035a478863a409caa605b2c529d45f2ac137d build1 
+7cbfd989b49c7336dc495a055db223253f0c1d6dc232b66b0fe0f7b6103c274c build2 diff --git a/resources/sourcecode/composer.json b/resources/sourcecode/composer.json new file mode 100644 index 0000000..bada0e3 --- /dev/null +++ b/resources/sourcecode/composer.json @@ -0,0 +1,7 @@ +{ + "name": "awesome/php-library", + "require": { + "foo/http": "^1", + "foo/bar": "1.2.3" + } +} diff --git a/resources/sourcecode/date-format-flags.log b/resources/sourcecode/date-format-flags.log new file mode 100644 index 0000000..d016b2b --- /dev/null +++ b/resources/sourcecode/date-format-flags.log @@ -0,0 +1,6 @@ +$ date -d@2147483647 +Tue Jan 19 04:14:07 AM CET 2038 +$ date -d@2147483647 -u +Tue Jan 19 03:14:07 AM UTC 2038 +$ LC_ALL=C date -d@2147483647 -u +Tue Jan 19 03:14:07 UTC 2038 diff --git a/resources/sourcecode/datetime.c b/resources/sourcecode/datetime.c new file mode 100644 index 0000000..c6e1bd2 --- /dev/null +++ b/resources/sourcecode/datetime.c @@ -0,0 +1,10 @@ +#include <stdio.h> + +int main() { + printf( + "Built the %s at %s.\n", + __DATE__, + __TIME__ + ); + return 0; +} diff --git a/resources/sourcecode/datetime.c.log b/resources/sourcecode/datetime.c.log new file mode 100644 index 0000000..cb79025 --- /dev/null +++ b/resources/sourcecode/datetime.c.log @@ -0,0 +1,11 @@ +$ gcc datetime.c -o datetime +$ sha256sum datetime +a123b...c8dba datetime +$ rm datetime +$ gcc datetime.c -o datetime +$ sha256sum datetime +937da...1284f datetime +$ ./datetime +Built the Nov 21 2023 at 17:19:34. +$ ./datetime +Built the Nov 21 2023 at 17:19:34. 
diff --git a/resources/sourcecode/debian-nix.log b/resources/sourcecode/debian-nix.log new file mode 100644 index 0000000..6d7e4c5 --- /dev/null +++ b/resources/sourcecode/debian-nix.log @@ -0,0 +1,8 @@ +$ lsb_release -a +Distributor ID: Debian +Description: Debian GNU/Linux 12 (bookworm) +Release: 12 +Codename: bookworm +$ sudo apt-get -yq install nix +$ nix --version +nix (Nix) 2.8.0 diff --git a/resources/sourcecode/example-makefile b/resources/sourcecode/example-makefile new file mode 100644 index 0000000..742e6eb --- /dev/null +++ b/resources/sourcecode/example-makefile @@ -0,0 +1,19 @@ +clean: + # This `clean` step is a dependency of the build step + # It will remove the files and artefacts generated by the `build` step + rm -rf test.txt + +build: clean + # This `build` step will compile the source code and generate the + # output artefacts. + # It has a dependency on the `clean` step + echo "hello world" > test.txt + +check: + # This `check` step will print the checksum of the output artefacts + sha256sum test.txt + +run: + # This `run` step will execute the output artefacts and print the result + # In case of a non-executable output, it will print the content of the # file or its checksum. 
+ cat test.txt diff --git a/resources/sourcecode/listing-typst-version.log b/resources/sourcecode/listing-typst-version.log new file mode 100644 index 0000000..da7370a --- /dev/null +++ b/resources/sourcecode/listing-typst-version.log @@ -0,0 +1,2 @@ +$ typst --version +typst 0.10.0 (f2433bd1) diff --git a/resources/sourcecode/montecarlo-pi-compilation.log b/resources/sourcecode/montecarlo-pi-compilation.log new file mode 100644 index 0000000..0824027 --- /dev/null +++ b/resources/sourcecode/montecarlo-pi-compilation.log @@ -0,0 +1,7 @@ +$ gcc montecarlo-pi.c -o montecarlo-pi +$ sha256sum montecarlo-pi +241d0d05314472c6472fd90814f253ccc1b4fd10366ab828201aab1ac1a0d376 montecarlo-pi +$ rm montecarlo-pi +$ gcc montecarlo-pi.c -o montecarlo-pi +$ sha256sum montecarlo-pi +241d0d05314472c6472fd90814f253ccc1b4fd10366ab828201aab1ac1a0d376 montecarlo-pi diff --git a/resources/sourcecode/montecarlo-pi.c b/resources/sourcecode/montecarlo-pi.c new file mode 100644 index 0000000..370cf96 --- /dev/null +++ b/resources/sourcecode/montecarlo-pi.c @@ -0,0 +1,21 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +int main(int argc, char* argv[]) { + srand(time(NULL)); + + double x,y,z,count; + int n = atoi(argv[1]); + + for (int i = 0; i < n; i++) { + x = (double) rand() / RAND_MAX; + y = (double) rand() / RAND_MAX; + z = x * x + y * y; + if (z <= 1) count++; + } + + printf("π approx.: %g", (count / n) * 4); + + return(0); +} diff --git a/resources/sourcecode/montecarlo-pi.c.log b/resources/sourcecode/montecarlo-pi.c.log new file mode 100644 index 0000000..a87a441 --- /dev/null +++ b/resources/sourcecode/montecarlo-pi.c.log @@ -0,0 +1,9 @@ +$ gcc montecarlo-pi.c -o montecarlo-pi +$ ./montecarlo-pi 10 +3.6 +$ ./montecarlo-pi 10 +4 +$ ./montecarlo-pi 1000000 +3.13989 +$ ./montecarlo-pi 1000000 +3.14048 diff --git a/resources/sourcecode/nix-typst-build-diff.log b/resources/sourcecode/nix-typst-build-diff.log new file mode 100644 index 0000000..d301586 --- /dev/null +++ 
b/resources/sourcecode/nix-typst-build-diff.log @@ -0,0 +1,26 @@ +--- /nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world ++++ /nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world.check +│ --- /nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world/hello-world.pdf +├── +++ /nix/store/1n4g9gsq34xzmg351r7sa87rrcazh8gf-typst-hello-world.check/hello-world.pdf +│┄ 'pdftotext' not available in path. Falling back to binary comparison. +│┄ Installing the 'pypdf' Python module may produce a better output. +│ @@ -9457,17 +9457,17 @@ +│ 00024f00: 6e65 730a 2020 2f46 6972 7374 2032 3820 nes. /First 28 +│ 00024f10: 3020 520a 2020 2f4c 6173 7420 3238 2030 0 R. /Last 28 0 +│ 00024f20: 2052 0a20 202f 436f 756e 7420 310a 3e3e R. /Count 1.>> +│ 00024f30: 0a65 6e64 6f62 6a0a 0a32 3920 3020 6f62 .endobj..29 0 ob +│ 00024f40: 6a0a 3c3c 0a20 202f 4372 6561 746f 7220 j.<<. /Creator +│ 00024f50: 2854 7970 7374 2030 2e31 312e 3029 0a20 (Typst 0.11.0). +│ 00024f60: 202f 4372 6561 7469 6f6e 4461 7465 2028 /CreationDate ( +│ -00024f70: 443a 3230 3234 3033 3230 3137 3537 3132 D:20240320175712 +│ +00024f70: 443a 3230 3234 3033 3230 3138 3130 3535 D:20240320181055 +│ 00024f80: 5a29 0a20 202f 4d6f 6444 6174 6520 2844 Z). /ModDate (D +│ -00024f90: 3a32 3032 3430 3332 3031 3735 3731 325a :20240320175712Z +│ +00024f90: 3a32 3032 3430 3332 3031 3831 3035 355a :20240320181055Z +│ 00024fa0: 290a 3e3e 0a65 6e64 6f62 6a0a 0a33 3020 ).>>.endobj..30 +│ 00024fb0: 3020 6f62 6a0a 3c3c 0a20 202f 4c65 6e67 0 obj.<<. /Leng +│ 00024fc0: 7468 2039 3836 0a20 202f 5479 7065 202f th 986. /Type / +│ 00024fd0: 4d65 7461 6461 7461 0a20 202f 5375 6274 Metadata. 
/Subt +│ 00024fe0: 7970 6520 2f58 4d4c 0a3e 3e0a 7374 7265 ype /XML.>>.stre +│ 00024ff0: 616d 0a3c 3f78 7061 636b 6574 2062 6567 am.)[Bare compilation]], + [ + - Full control over compilation + - Direct understanding of dependencies inherited from host system + ], + [ + - Prone to #emph["it works on my machine"] issues + - Lacks isolation and dependency management + ], + table.cell(align: horizon + center, text(size: 2em)[\u{00D7}]), + table.cell(align: horizon + center, text(size: 2em)[\u{00D7}]), + table.cell(align: horizon + center, text(size: 2em)[\u{00D7}]), + table.hline(stroke: .5pt + black.lighten(75%)), + table.cell(align: horizon + left)[2. #link()[Docker]], + [ + - Better isolation and dependency management thanks to containerization + - Isolation from host system + - Popular solution, widely adopted + ], + [ + - Potential variability due to base images and package management + - Additional layer of abstraction due to containerization + ], + table.cell(align: horizon + center, text(size: 2em)[\u{223C}]), + table.cell(align: horizon + center, text(size: 2em)[\u{223C}]), + table.cell(align: horizon + center, text(size: 2em)[\u{223C}]), + table.hline(stroke: .5pt + black.lighten(75%)), + table.cell(align: horizon + left)[3. 
#link()[Guix]], + table.cell(align: left + horizon)[ + - Deterministic builds with explicit dependency specification + - Functional package management + - Immutable software environments + - Isolation and environment reproducibility + - No containerization overhead + ], + [ + - Steep learning curve + - Paradigm shift from traditional package management systems required + - Very limited package availability + - Unfree packages are not officially allowed + ], + table.cell(align: horizon + center, rowspan: 2, text(size: 2em)[\u{2713}]), + table.cell(align: horizon + center, rowspan: 2, text(size: 2em)[\u{2713}]), + table.cell(align: horizon + center, rowspan: 2, text(size: 2em)[\u{2713}]), + table.hline(stroke: .5pt + black.lighten(75%)), + table.cell(align: horizon + left)[4. #link()[Nix]], + table.cell(align: left + horizon)[ + - Deterministic builds with explicit dependency specification + - Functional package management + - Immutable software environments + - Isolation and environment reproducibility + - No containerization overhead + - Vast repository of packages, unfree packages are authorized. 
+ ], + [ + - Steep learning curve + - Paradigm shift from traditional package management systems required + ], + table.footer( + table.cell( + align: right, + colspan: 6, + text(size: .7em)[ + Legend: \u{2713} = Supported, \u{223C} = Partially supported, \u{00D7} = Not supported + ], + ), + ), +) diff --git a/resources/typst/ch4-table-conclusion.typ b/resources/typst/ch4-table-conclusion.typ new file mode 100644 index 0000000..de3f779 --- /dev/null +++ b/resources/typst/ch4-table-conclusion.typ @@ -0,0 +1,92 @@ +#set align(left) + +#table( + columns: (1fr, 1fr), + stroke: none, + [#align(center)[Pros]], + table.vline(stroke: .5pt), + [#align(center)[Cons]], + table.hline(stroke: .5pt), + [ + Facilitates collaboration and onboarding: + + Reproducibility enables easier collaboration among researchers and developers, + as they can replicate and extend each other's work more efficiently. + ], + [ + Steep learning curve: + + Implementing reproducibility practices may require learning new tools and + methodologies, which can be time-consuming and challenging. + ], + table.hline(stroke: .5pt), + [ + Transparency and trust: + + By sharing the methods, data, and tools used in research and development, + other collaborators can verify and build upon the work, fostering a culture + of openness and collaboration. + ], + [ + Complexity: + + The process of making software reproducible can be complex, especially if it + hasn't been set up from the beginning of the project. + ], + table.hline(stroke: .5pt), + [ + Improves software quality: + + Reproducibility practices help in identifying and fixing bugs, improving the + software's overall quality and robustness. + ], + [ + Proliferation of package managers: + + The existence of too many package managers that + are built without reproducibility in mind can add another layer of complexity when trying to build software reproducibly. 
+ ], + table.hline(stroke: .5pt), + [ + Enhanced reliability and validity: + + Reproducible results provide confidence that findings are accurate and not + due to random chance or specific initial conditions of a single experiment. + ], + [ + Factors limiting reproducibility: + + Factors such as proprietary software, licensing issues, and evolving + hardware can pose challenges to achieving full reproducibility. + ], + table.hline(stroke: .5pt), + [ + Security and integrity: + + Ensuring that software can be reliably rebuilt from its source helps in + detecting unauthorized changes, enhancing security, and maintaining the + integrity of the software supply chain. + ], + [ + Potential for misuse: + + Over-reliance on automated reproducibility tools can lead to complacency, + where developers might not fully understand the underlying processes and + methodologies. + ], + table.hline(stroke: .5pt), + [ + Facilitates troubleshooting and debugging: + + Reproducible experiments serve as a clear benchmark for comparison, + assisting teams in identifying discrepancies, tracing error origins, and + incrementally enhancing model performance. + ], + [ + Potential for stagnation: + + Emphasis on reproducibility might slow down innovation as developers might + spend more time ensuring reproducibility rather than exploring new ideas and + methodologies. 
+ ] +) diff --git a/resources/typst/equivalence-classes-of-reproducibility.typ b/resources/typst/equivalence-classes-of-reproducibility.typ new file mode 100644 index 0000000..9448b44 --- /dev/null +++ b/resources/typst/equivalence-classes-of-reproducibility.typ @@ -0,0 +1,24 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#table( + columns: (1fr, 1fr), + align: left + horizon, + stroke: none, + table.header( + [*Equivalence class*], + table.vline(stroke: .5pt), + [*Examples*], + table.hline(stroke: .5pt), + ), + [Same phenomenon], + [Human experts], + table.hline(stroke: .5pt), + [Same statistics], + [Software like GNUplot, Matplotlib, R], + table.hline(stroke: .5pt), + [Same data], + [Checksum of file contents], + table.hline(stroke: .5pt), + [Same bits], + [Checksum of file contents and metadata], +) diff --git a/resources/typst/essawy-table.typ b/resources/typst/essawy-table.typ new file mode 100644 index 0000000..f9eb06b --- /dev/null +++ b/resources/typst/essawy-table.typ @@ -0,0 +1,55 @@ +#import "../../src/thesis/imports/preamble.typ": * +#import table: cell, header + +#{ + table( + columns: 7, + align: (right,) + (center,) * 2, + stroke: none, + table.header( + cell(rowspan: 2)[], + table.vline(stroke: .5pt), + cell(colspan: 3)[*Original*], + table.vline(stroke: .5pt), + cell(colspan: 3)[*Other*], + [*researcher*], + [*machine*], + [*data*], + [*researcher*], + [*machine*], + [*data*], + table.hline(stroke: .5pt), + ), + [*Repeatability*], + [\u{2713}], + [\u{2713}], + [\u{2713}], + [], + [], + [], + table.hline(stroke: .5pt + black.lighten(75%)), + [*Runnability*], + [\u{2713}], + [], + [\u{2713}], + [], + [\u{2713}], + [], + table.hline(stroke: .5pt + black.lighten(75%)), + [*Reproducibility*], + [], + [], + [\u{2713}], + [\u{2713}], + [\u{2713}], + [], + table.hline(stroke: .5pt + black.lighten(75%)), + [*Replicability*], + [], + [], + [], + [\u{2713}], + [\u{2713}], + [\u{2713}], + ) +} diff --git a/resources/typst/essawy.typ 
b/resources/typst/essawy.typ new file mode 100644 index 0000000..cf572ff --- /dev/null +++ b/resources/typst/essawy.typ @@ -0,0 +1,107 @@ +#{ + set text( + font: "Virgil 3 YOFF", + size: .9em, + ) + box[ + #grid( + columns: 1, + rows: 4, + gutter: 0pt, + polygon( + fill: blue.lighten(80%), + stroke: blue, + (0pt, 3.5cm), + (70pt, 0cm), + (140pt, 3.5cm), + ), + polygon( + fill: blue.lighten(80%), + stroke: blue, + (0pt, 2cm), + (40pt, 0pt), + (180pt, 0pt), + (220pt, 2cm), + ), + polygon( + fill: blue.lighten(80%), + stroke: blue, + (0pt, 2cm), + (40pt, 0pt), + (260pt, 0pt), + (300pt, 2cm), + ), + polygon( + fill: blue.lighten(80%), + stroke: blue, + (0pt, 2cm), + (40pt, 0pt), + (340pt, 0pt), + (380pt, 2cm), + ), + ) + + // Right line + #place(bottom + left)[ + #line(length: 320pt, angle: 55deg, start: (216pt, 0pt)) + ] + + // Left line + #place(bottom + right)[ + #line(length: 320pt, angle: -55deg, start: (-400pt, 0pt)) + ] + + // Left arrow + #place(top + left, dx: 161pt)[ + #rotate(35deg)[ + #polygon.regular(fill: black, size: 10pt, vertices: 3) + ] + ] + + // Text left + #place(bottom + left, dx: 50pt, dy: -125pt)[ + #rotate(-55deg)[ + time + ] + ] + + // Right arrow + #place(top + right, dx: -161pt)[ + #rotate(-35deg)[ + #polygon.regular(fill: black, size: 10pt, vertices: 3) + ] + ] + + // Text right + #place(bottom + left, dx: 300pt, dy: -130pt)[ + #rotate(55deg)[ + effort + ] + ] + + #place(center, dy: -45pt)[ + Repeatability\ + + Original researcher,\ machine and data + ] + + #place(center, dy: -105pt)[ + Runnability\ + + Original researcher and data\ other machine + ] + + #place(center, dy: -160pt)[ + Reproducibility\ + + Original data\ other researcher and machine + ] + + #place(center, dy: -215pt)[ + Replicability\ + + Other researcher,\ + machine and data + ] + ] +} diff --git a/resources/typst/figure-checksum.typ b/resources/typst/figure-checksum.typ new file mode 100644 index 0000000..5666be7 --- /dev/null +++ b/resources/typst/figure-checksum.typ @@ 
-0,0 +1,31 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#{ + set align(center + horizon) + set text(font: "Virgil 3 YOFF") + grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (40pt, 25pt), + image("../../resources/images/inputs-icon.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/computation-cogs.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + { + set text( + font: "Liberation Mono", + size: .7em, + ) + box(inset: 10pt, fill: luma(230), width: 100pt, stroke: 1pt, radius: 5pt)[ + 2aae6c35c9\ + 4fcfb415db\ + e95f408b9c\ + e91ee846ed + ] + }, + "Inputs", + "", + "Checksum", + "", + "Outputs", + ) +} diff --git a/resources/typst/functions-vs-computations.typ b/resources/typst/functions-vs-computations.typ new file mode 100644 index 0000000..f4217cd --- /dev/null +++ b/resources/typst/functions-vs-computations.typ @@ -0,0 +1,29 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#table( + columns: (2fr, 3fr, 5fr), + stroke: none, + align: left + top, + table.header( + [], + table.vline(stroke: .5pt), + [*Theoretically*], + table.vline(stroke: .5pt), + [*Practically*], + table.hline(stroke: .5pt), + ), + [*Function*], + [$I -> R$], + [ + $"eval"(F,I,emptyset) -> R$ + ], + table.hline(stroke: .5pt), + [*Computation*], + [$I times E -> R$], + [ + $"eval"(F,I,E) -> R$\ + + Reproducible if and only if\ + $forall e_1,e_2 in E quad "eval"(F,I,e_1) = "eval"(F,I,e_2)$ + ], +) diff --git a/resources/typst/inputs-and-outputs-part1.typ b/resources/typst/inputs-and-outputs-part1.typ new file mode 100644 index 0000000..b6ec089 --- /dev/null +++ b/resources/typst/inputs-and-outputs-part1.typ @@ -0,0 +1,18 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#set align(center + horizon) +#set text(font: "Virgil 3 YOFF") +#grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (40pt, 25pt), + image("../../resources/images/inputs-cube.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + 
image("../../resources/images/computation-cogs.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/inputs-icon.svg"), + "Inputs", + "", + "Computation", + "", + "Outputs", +) diff --git a/resources/typst/inputs-and-outputs-part2.typ b/resources/typst/inputs-and-outputs-part2.typ new file mode 100644 index 0000000..508156d --- /dev/null +++ b/resources/typst/inputs-and-outputs-part2.typ @@ -0,0 +1,18 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#set align(center + horizon) +#set text(font: "Virgil 3 YOFF") +#grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (40pt, 25pt), + image("../../resources/images/inputs-cube-2.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/computation-cogs.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/inputs-icon.svg"), + "Inputs", + "", + "Computation", + "", + "Outputs", +) diff --git a/resources/typst/inputs-and-outputs-part3.typ b/resources/typst/inputs-and-outputs-part3.typ new file mode 100644 index 0000000..51c9eb8 --- /dev/null +++ b/resources/typst/inputs-and-outputs-part3.typ @@ -0,0 +1,18 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#set align(center + horizon) +#set text(font: "Virgil 3 YOFF") +#grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (40pt, 25pt), + image("../../resources/images/inputs-cube-3.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/computation-cogs.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/inputs-icon.svg"), + "Inputs", + "", + "Computation", + "", + "Outputs", +) diff --git a/resources/typst/inputs-and-outputs-part4.typ b/resources/typst/inputs-and-outputs-part4.typ new file mode 100644 index 0000000..5f90e07 --- /dev/null +++ b/resources/typst/inputs-and-outputs-part4.typ @@ -0,0 +1,32 @@ +#import "../../src/thesis/imports/preamble.typ": * +#import "../../src/thesis/theme/colors.typ": * + +#set 
align(center + horizon) +#set text(font: "Virgil 3 YOFF") +#grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (70pt, 25pt), + { + place(top + left, dx: 15pt, dy: 9pt)[#text(fill: umons-red)[Program]] + place( + top + left, + dx: 15pt, + dy: 31pt, + )[#text(fill: umons-turquoise)[Parameters]] + place(top + left, dx: 15pt, dy: 53pt)[#text(fill: umons-grey)[Environment]] + image("../../resources/images/build-inputs1.svg") + }, + { + xarrow(sym: sym.arrow.r, width: 50pt, "") + xarrow(sym: sym.arrow.r, width: 50pt, "") + xarrow(sym: sym.arrow.r, width: 50pt, "") + }, + { + place(top + left, dx: 35pt, dy: 38pt)[#text(size: .75em)[Evaluation]] + image("../../resources/images/build-inputs2.svg") + }, + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/inputs-icon.svg"), + + "Inputs", "", "Computational environment", "", "Outputs", +) diff --git a/resources/typst/inputs-computation-outputs.typ b/resources/typst/inputs-computation-outputs.typ new file mode 100644 index 0000000..29fad0c --- /dev/null +++ b/resources/typst/inputs-computation-outputs.typ @@ -0,0 +1,15 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#set align(center + horizon) +#set text(font: "Virgil 3 YOFF") +#grid( + columns: (1fr, 1fr, 1fr, 1fr, 1fr), + rows: (40pt, 25pt), + image("../../resources/images/inputs-icon.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/computation-cogs.svg"), + xarrow(sym: sym.arrow.r, width: 50pt, ""), + image("../../resources/images/inputs-icon.svg"), + + "Inputs", "", "Computation", "", "Outputs", +) diff --git a/resources/typst/my-app-graph-not-ok.typ b/resources/typst/my-app-graph-not-ok.typ new file mode 100644 index 0000000..fbbd257 --- /dev/null +++ b/resources/typst/my-app-graph-not-ok.typ @@ -0,0 +1,33 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#{ + set text( + font: "Inconsolata Nerd Font Mono", + size: 1em, + ) + render( + read("../../resources/graphviz/my-app-not-ok.dot"), + 
width: 100%, + labels: ( + "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12": [my-app-1.2.3], + "6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8": [bzip2-1.0.8], + "d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4": [ncurses-6.4], + "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1": [sqlite-3.43.1], + "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1": [readline-8.2p1], + "ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3": [xz-5.6.1], + "jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23": [gdbm-1.23], + "l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10": [openssl-3.0.10], + "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8": [glibc-2.37-8], + "8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4": [libidn2-2.3.4], + "br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1": [libunistring-1.1], + "ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36": [libxcrypt-4.4.36], + "q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0": [expat-2.5.0], + "rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc": [xgcc-12.3.0-libgcc], + "xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4": [zlib-1.3], + "xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15": [bash-5.2-p15], + "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib": [gcc-12.3.0-lib], + "xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc": [gcc-12.3.0-libgcc], + "35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4": [libffi-3.4.4], + ), + ) +} diff --git a/resources/typst/python-graph.typ b/resources/typst/python-graph.typ new file mode 100644 index 0000000..eb31179 --- /dev/null +++ b/resources/typst/python-graph.typ @@ -0,0 +1,37 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#figure( + natural-image({ + set text( + font: "Inconsolata Nerd Font Mono", + size: 1em, + ) + render( + read("../../resources/graphviz/python.dot"), + labels: ( + "pzf6dnxg8gf04xazzjdwarm7s03cbrgz-python3-3.10.12": [python3-3.10.12], + "6947mfg2jlid97cnvzvc6cvv6wpj2yhg-bzip2-1.0.8": [bzip2-1.0.8], + "d48d0ppksa6gwxjlkwf2i93rilyv9jvq-ncurses-6.4": [ncurses-6.4], + 
"dk5vk3c9zknbjzzxmiglzv46qgv32gb0-tzdata-2023c": [tzdata-2023c], + "fmh3s032bcsbfcdp82zsjlmkj1kp72j6-sqlite-3.43.1": [sqlite-3.43.1], + "g3dx6xjlvkg2njyxjsx9dswx5wjvkrm5-readline-8.2p1": [readline-8.2p1], + "ig0kkzw4n2pws12dj7szjm71f1a43if6-zlib-1.3": [zlib-1.3], + "jhqflhc7k4jwz5s13cj219pvwywzc6j9-gdbm-1.23": [gdbm-1.23], + "l7f1pf2dysadqpdxhsb9li01h5jwn5xr-openssl-3.0.10": [openssl-3.0.10], + "ld03l52xq2ssn4x0g5asypsxqls40497-glibc-2.37-8": [glibc-2.37-8], + "8ny01r2xa5mv5brk9srdmv91wrjvxila-libidn2-2.3.4": [libidn2-2.3.4], + "br1p5pan2pgmgrm81kj43qawd9b9nns1-libunistring-1.1": [libunistring-1.1], + "ml12av0bi52w2nyrpay8l47xwm1m6i7b-libxcrypt-4.4.36": [libxcrypt-4.4.36], + "pfqk28f0yaq18ha10ri9d3a8z5kv8s6l-mailcap-2.1.53": [mailcap-2.1.53], + "q7gkbmmxwai8idqigl9kyv2a7vhppz92-expat-2.5.0": [expat-2.5.0], + "rfckdjskd983ylf05jm9mlsw7y618hyr-xgcc-12.3.0-libgcc": [xgcc-12.3.0-libgcc], + "xa1bg4dk78cx7g9zqqs0akhv0my9l7w5-xz-5.4.4": [xz-5.4.4], + "xdqlrixlspkks50m9b0mpvag65m3pf2w-bash-5.2-p15": [bash-5.2-p15], + "xq05361kqwzcdamcsxr4gzg8ksxrb8sg-gcc-12.3.0-lib": [gcc-12.3.0-lib], + "xvxaw8q1b4dja27ljmynmc9818aagjz3-gcc-12.3.0-libgcc": [gcc-12.3.0-libgcc], + "35badg7gpxkhyzcrdyh2dfi9wfd43phz-libffi-3.4.4": [libffi-3.4.4], + ), + ) + }), + caption: [Runtime dependencies tree of the Python 3 interpreter], +) diff --git a/resources/typst/reproducibility-rule.typ b/resources/typst/reproducibility-rule.typ new file mode 100644 index 0000000..461e3b7 --- /dev/null +++ b/resources/typst/reproducibility-rule.typ @@ -0,0 +1,21 @@ +#{ + set text(font: "Virgil 3 YOFF") + image("../../resources/images/rules.svg", fit: "stretch") + v(-1.5em) + grid( + columns: (1fr, 1fr, 1fr), + { + set align(left) + "Not reproducible" + }, + { + set align(center) + "Partially reproducible" + }, + { + set align(right) + "Reproducible" + }, + ) + v(2em) +} diff --git a/resources/typst/scientific-method-w-r13y.typ b/resources/typst/scientific-method-w-r13y.typ new file mode 100644 index 
0000000..468b03f --- /dev/null +++ b/resources/typst/scientific-method-w-r13y.typ @@ -0,0 +1,15 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#render( + read("../../resources/graphviz/scientific-method-with-reproducibility.dot"), + width: 100%, + labels: ( + step0: [#text(font: "Virgil 3 YOFF", size: .95em)[Observation]], + step1: [#text(font: "Virgil 3 YOFF", size: .95em)[Question]], + step2: [#text(font: "Virgil 3 YOFF", size: .95em)[Hypothesis]], + step3: [#text(font: "Virgil 3 YOFF", size: .95em)[Prediction]], + step4: [#text(font: "Virgil 3 YOFF", size: .95em)[Testing]], + step5: [#text(font: "Virgil 3 YOFF", size: .95em)[Analysis]], + step6: [#text(font: "Virgil 3 YOFF", size: .95em)[Repeat]], + ), +) diff --git a/resources/typst/scientific-method.typ b/resources/typst/scientific-method.typ new file mode 100644 index 0000000..3b23f1a --- /dev/null +++ b/resources/typst/scientific-method.typ @@ -0,0 +1,15 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#render( + read("../../resources/graphviz/scientific-method.dot"), + width: 100%, + labels: ( + step0: [#text(font: "Virgil 3 YOFF", size: .95em)[Observation]], + step1: [#text(font: "Virgil 3 YOFF", size: .95em)[Question]], + step2: [#text(font: "Virgil 3 YOFF", size: .95em)[Hypothesis]], + step3: [#text(font: "Virgil 3 YOFF", size: .95em)[Prediction]], + step4: [#text(font: "Virgil 3 YOFF", size: .95em)[Testing]], + step5: [#text(font: "Virgil 3 YOFF", size: .95em)[Analysis]], + step6: [], + ), +) diff --git a/src/thesis/1-introduction.typ b/src/thesis/1-introduction.typ new file mode 100644 index 0000000..49b6b82 --- /dev/null +++ b/src/thesis/1-introduction.typ @@ -0,0 +1,391 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * +#import "theme/common/titlepage.typ": * +#import "theme/common/metadata.typ": * +#import "theme/disclaimer.typ": * +#import "theme/leftblank.typ": * +#import "theme/acknowledgement.typ": * +#import "theme/abstract.typ": * +#import 
"theme/glossary.typ": * +#import "theme/infos.typ": * + +#chapterquote( + title: "Introduction", + ref: "chapter1", + quoteAttribution: , + quoteText: [ + Sine experientia nihil sufficienter sciri potest + ], +) + +== History + +@Bacon1928, an English philosopher and scientist, articulated in 1267 the +foundations of what we today recognise as "reproducibility". He famously stated +in Latin #quote("Sine experientia nihil sufficienter sciri potest") which means +#quote("Without experience nothing can be sufficiently known") +#cite(form: "normal", supplement: [p.583], ). He was among the first +to underscore the significance of repeated experimentation as a means to test +and ultimately confirm scientific findings. Although the specific term +"reproducibility" was not used in his time, his emphasis on empirical evidence +is seen as a precursor to our modern understanding of reproducible research in +the scientific method. + +Centuries later, @kpopper1934, an Austrian-British philosopher wrote a book on +the importance of falsifiability in the scientific method. He argued that a +scientific theory must be falsifiable in order to be considered valid. He also +introduced the concept of #emph[falsificationism], which states that a theory +can only be considered scientific if it is possible to conceive of an +observation or an argument which proves the theory false. This principle is now +widely accepted as a fundamental tenet of the scientific method. + +@Thompson84 delivered a lecture "Reflections on Trusting Trust" at the +Association for Computing Machinery (ACM) Turing Award Banquet. Since his talk, +the landscape of software has undergone a radical metamorphosis. The simplicity +of his advice to #emph[trust people] has become significantly more complex +in the current era. The modern software supply chain is extensive, often +encompassing dependencies that often lie beneath the surface. 
Despite the +prevalence of open-source software within this supply chain, it is uncommon for +end-users to compile their own software. As a result, the build systems became a +prime vector for malicious exploitation +#cite(, form: "normal", supplement: [p.1]). This underscores +the vital importance of software reproducibility which ensures that software can +be reliably built and verified from its source across different environments and +over time, mitigating the risks associated with trust and the potential for +exploitation within the software supply chain. + +@Claerbout1992 wrote about the challenges and implications of reproducibility in +the paper titled "Electronic documents give reproducible research a new +meaning". This work was in the field of geophysics, but it has been influential +across multiple domains of #gls("CS"). It was one of the early works to +emphasise that the sharing of software and environments is critical to the +reproducibility of computational results. + +@Collberg2012 has underscored the importance of reproducibility in #gls("SE") by +advocating for and funding initiatives that enhance both repeatability and +reproducibility. They promote the adoption of standardised practices and +transparency in research, which are crucial for ensuring that experiments are +repeatable, a foundational aspect of reproducibility. By encouraging +comprehensive documentation and public sharing of methodologies, data, and code, +they facilitate the replication of work by others, thus enhancing the overall +reliability and verification of scientific findings. Their financial support +extends to tools and infrastructure that assist in establishing repeatability, +which is paramount to achieving reproducibility in broader research. 
Recognising +and rewarding efforts to share research artefacts further embeds a culture where +both repeatability and reproducibility are fundamental practices in #gls("SE"), +ensuring that studies can not only be repeated under the same conditions but +also reproduced and validated in different contexts. + +In 2020, the United States sustained a sophisticated cyberattack known as +SolarWinds. This meticulously orchestrated campaign apparently attributed to +Russia persisted undetected for several months and was enabled through a +backdoor embedded within one of the dependencies of the SolarWinds Orion +(@solarwinds-9579611) software, a network monitoring tool employed by numerous +corporations and government agencies. By exploiting this vulnerability, the +attackers gained unauthorised access to the networks of SolarWinds clients, +which allowed them to steal data and deploy malware. + +This incident raises a question "How did such a consequential security breach +remain unnoticed for months?" and underscores the critical importance of +reproducibility in #gls("SE"). Had the SolarWinds Orion software and its +dependencies been subject to stringent reproducibility standards, where every +build could be precisely recreated and examined, the malicious alterations might +have been detected earlier. Reproducibility in this context not only refers to +the ability to replicate software builds but also to verify the integrity and +provenance of every component, ensuring that no unauthorised changes have been +made. By prioritising reproducibility, developers and users of software can +enhance security measures, mitigate risks, and foster a more trustworthy digital +environment. + +In a post from the @joinupSignal, institutions like the European Commission and +the European Parliament began recommending Signal +#cite(, form: "normal"), a secure open-source instant messaging +platform, for communications. 
This highlighted a broader, critical issue: the +verification of software authenticity. Users generally trust software obtained +from various stores, but this trust brings to the forefront the question of how +they can verify with certainty that the version of the Signal application +installed on their devices indeed derives directly from the source code provided +in its repository. Ensuring an application's authenticity, confirming it has not +been tampered with prior to its public release, has become a central concern in +our everyday lives. This issue underscores the importance of transparent and +reproducible builds where the end product can be reliably traced back to its +original source, ensuring the integrity and security of the software being used. + +During the OpenAI DevDay's keynote #cite(, form: "normal") +in November 2023, OpenAI's CEO Sam Altman unveiled a groundbreaking feature +called #emph[reproducible outputs]. This innovation enables users to +consistently replicate the outcomes generated by OpenAI's models, marking a +significant advancement in achieving reproducibility within the realm of +artificial intelligence. + +Software companies have increasingly recognised the importance of +reproducibility for enhancing security. Popular messaging platforms like Signal +#cite(, form: "normal") and Telegram +#cite(, form: "normal") have taken significant steps to +ensure the reproducibility of their builds. They support reproducible builds, +allowing users to verify that the open-source code matches the applications +available on various app stores, including the Apple Store and Google Play. This +initiative ensures that the distributed binaries are authentic and unaltered, +thereby protecting users from potential vulnerabilities and enhancing trust in +the software's integrity. 
By implementing these procedures, software companies +highlight the broader industry's move towards transparency and reliability in +software distribution, reinforcing the essential role of reproducibility in +modern #gls("SE"). + +In the light of these considerations, this master thesis addresses the +aforementioned issues and questions by delving into the principles of software +reproducibility. We will explore the mechanisms that can ensure the integrity of +software, and examine how these practices can be standardised to safeguard +against the risks of unauthorised tampering. In doing so, this thesis aims to +contribute to the critical discourse on software security and reliability in an +era where digital trust is paramount. + +== Background + +Curiosity has always been at the core of my being, fuelling an insatiable +eagerness to learn and explore the unknown. I remember myself as an inquisitive +boy, constantly delving into the mechanics of how things work. This curiosity +often led to disassembling devices to uncover their hidden secrets, followed by +a harder quest to reassemble them. While my father played a pivotal role in this +journey of discovery, and I simply cannot recall a single moment when he +responded with #emph["I don't know"] it was my mother who truly ignited my +path to computers. Her encouragement and unwavering belief in pursuing my +passions were instrumental in shaping my journey towards a deeper understanding +of the world around me, especially computers. + +My interest in computers and software development was sparked in my early +childhood, before the age of ten with the Logo language, which soon led to my +discovery of the BASIC programming language on an Atari 1040STE. The +capabilities of that machine captivated my young mind, igniting a passion for +technology and its boundless potential. This early fascination was a signpost +towards my future; it was clear that my career would be intertwined with +computers. 
Over the years, I witnessed the remarkable evolution of the software +industry and the advent of groundbreaking technologies. I have also observed the +progression in software development methodologies. However, despite the influx +of new technologies arising, there are certain categories of issues that remain +constant along the passing years, sadly. + +Transitioning from BASIC, I briefly jumped on Microsoft Windows before moving to +Linux, a platform that has since become my daily driver. In 2019, I found myself +grappling with a sluggish laptop running a popular Linux distribution. In search +of a faster, binary-based alternative, I transitioned to NixOS +#cite(, form: "normal"). This shift marked the beginning of a totally new +perspective on software development for me. It was through NixOS that I +encountered the concept of "reproducibility" which opened my eyes to the +possibilities of making and shipping more reliable software. + +We've seen in the previous section and will detail further in the next chapters +that this concept originates from researchers and the scientific method. This +concept can be transposed to #gls("CS") and more specifically to software +development. In this context, reproducibility is the ability to recreate the +exact same software, including the operating system, the compiler, the +libraries, and the source code, in order to obtain the same results. + +For the past three years, the principle of reproducibility has totally +revolutionised my approach to software development. This concept has captivated +me to such an extent that I now devote a significant portion of my free time to +contributing to open-source projects that emphasise reproducible builds. It is +this profound interest that has inspired me to dedicate my master's thesis to +exploring the depths and implications of reproducibility in #gls("SE"). 
+ +Here's a non-exhaustive list of projects I have contributed to: +- In the Linux NixOS operating system: + - I created around 430 pull requests #cite(, form: "normal"). + - I made around 1800 reviews #cite(, form: "normal"). + - After several months of dedicated effort, I developed a wrapper for building + reproducible Composer-based PHP applications + #cite(, form: "normal"), resolving a significant + obstacle and positioning Nix as the preferred distribution for self-hosting + PHP applications. An updated version is in preparation + #cite(, form: "normal"), which will be more + user-friendly and will provide a more comprehensive solution for PHP + developers while being at least twice as fast as the previous version. + +- In the PHP #cite(, form: "normal") community: + - In Composer #cite(, form: "normal"), the PHP package manager, I + proposed a pull request enabling deterministic outputs by default + #cite(, form: "normal"). + - I advocate for reproducibility by giving talks + #cite(, form: "normal"). + - I open issues in PHP projects that are not shipping required files to enable + reproducibility, explaining the reasons why they should be included: + - In PHPUnit, the PHP testing framework + #cite(, form: "normal")\; + - In PsySH, a PHP #gls("REPL", long: false) + #cite(, form: "normal")\; + - In GrumPHP, a code quality tool + #cite(, form: "normal")\; + - In Psalm, a static analysis tool + #cite(, form: "normal")\; + - In PHPMD, a static analysis tool + #cite(, form: "normal")\; + - In PHP-CS-Fixer, a code formatter + #cite(, form: "normal")\; + - In PHP-Parallel-Lint, a code linter + #cite(, form: "normal")\; + - I initiated and participated in discussions to improve reproducibility in + the PHP source code #cite(, form: "normal"). 
+- In the Reproducible Builds #cite(, form: "normal") + project: + - I contributed to the website by making it reproducible + #cite(, form: "normal")\; + - I improved the documentation #cite(, form: "normal") + #cite(, form: "normal")\; + - I contributed to the monthly reports + #cite(, form: "normal")\; +- In the Typst #cite(, form: "normal") project: + - I raised awareness about the importance of reproducibility + #cite(, form: "normal")\; + - I engaged in discussions on Discord leading to improving the compilation + environment hermeticity + #cite(, form: "normal")\; +- At work, I advocate the cause of reproducibility, emphasising its critical + importance in our projects. The objective is to raise awareness amongst my + colleagues about the advantages of reproducibility, with the ultimate aim of + establishing it as a norm within our organisational software practices. As an + initial measure, I am developing proofs of concept that illustrate the + process of creating reproducible containers, embedding their #gls("SBOM") + within their metadata. Additionally, I pioneered a project focused on + generating ephemeral, reproducible, and tailor-made development environments + and user profiles #cite(, form: "normal"). Finally, I try to + provide reproducible development environments for each open-source project + #cite(, form: "normal") we publish to foster a more transparent + and secure software development process but also to encourage contributions. +- In 2022, I participated in the #emph[Summer of Nix], a paid summer program + designed to foster learning, networking, and collaboration within the Nix + community. The program caters to both experienced Nix users and newcomers, + offering a unique opportunity to work together on a diverse range of topics. + During this event, I gave a talk about how we use Nix at work + #cite(,form:"normal"). 
+- In a recent YouTube interview on "La Tronche En Biais" + #cite(, form: "normal") , in a live titled "SCIENCES: Une crise de + reproductibilité des études?" #cite(, form: "normal"), I + briefly discussed the reproducibility crisis in scientific studies and drew + parallels with #gls("SE"). I shared insights from my master's thesis on this + topic, particularly highlighting the challenges faced when software compiled + in one environment fails in another, highlighting broader implications for + security and consistency across different systems. I also clarified different + levels of reproducibility introduced in @ch2-r13y-levels. + +== Motivation + +The pursuit of reproducibility in #gls("SE") is driven by a +fundamental quest for precision, reliability, and trust in the digital +landscape. In an age where software pervades nearly every aspect of our personal +and professional lives, the importance of being able to reliably replicate +software builds cannot be overstated. + +At its core, software reproducibility addresses a simple yet profound question: +can we consistently recreate the same software product, with the same +functionality and performance, across different environments and over time? This +question is not just academic but is deeply rooted in practical necessities and +ethical considerations in the field of #gls("CS"), but not only. + +The principle of reproducibility is essential across various disciplines, +ranging from the empirical rigor of scientific experiments to painting or even +culinary art. This concept, at its core, is about the ability to consistently +replicate results under similar conditions. + +In cooking, recipes passed down through generations serve as blueprints for +recreating cherished family dishes. Despite meticulously following these +recipes, achieving the exact taste and texture of an ancestral meal can +sometimes be elusive. 
Factors like cooking temperature, ingredient quality, or +even altitude can possibly alter the outcome. This uncertainty in replicating +results underscores the complexity and nuances involved in the process of +reproducibility. + +In #gls("CS"), the implications of reproducibility take on a more systemic and +critical dimension. Imagine wanting to use or build a software application, +ensuring it is identical to what the original developer intended. For instance, +this thesis itself is a digital artefact, a #gls("PDF") document derived from +source files hosted in a public repository. A pertinent question arises: how can +one be sure that the document produced from the source code today will be +identical to one compiled a year from now? Ensuring reproducibility in such +cases is not just a matter of convenience but a cornerstone for verifying +authenticity and integrity in a digital world increasingly prone to +misinformation and security threats. + +Reproducibility in #gls("SE") lies in its potential to enhance reliability and +security. Reproducibility aims to eliminate the all-too-common refrain of +#emph["it works on my machine!"] by establishing a more robust, consistent build +and deployment process. It is about creating a development environment where +software, when operational on one machine, can be expected to be built and +function identically on another, thereby bridging gaps in consistency and +predictability. In a digital era where trust and security are paramount, +reproducibility is not merely a technical objective; it is a fundamental +criterion for building and maintaining digital trust. + +One of the driving factors behind this research is the inherent complexity +present in modern software environments. Today's software systems are built on +intricate layers of dependencies, including various libraries, frameworks, and +operating systems. 
This complexity poses significant challenges in ensuring +consistent behaviour of software products across different environments. +Moreover, the escalating frequency of security breaches and malicious attacks on +software supply chains underscores the critical role of reproducibility. It +serves as a vital mechanism for verifying the integrity of software, assuring +that it has not been compromised, and maintaining the transparency of the build +process. + +Furthermore, the academic and scientific rigors of #gls("CS") demand a steadfast +commitment to reproducibility. In a field where building upon previous work is +the norm, the ability to validate and replicate research findings is +indispensable. This aspect is particularly crucial in open-source software +development, which thrives on community collaboration. The open-source paradigm +hinges on the capability of developers around the world to replicate, modify, +and contribute to codebases consistently and efficiently. + +Lastly, the evolving nature of software poses its own set of challenges. +Software development is a dynamic process, with systems continually evolving and +adapting. Maintaining reproducibility ensures that earlier versions of software +can be reliably reconstructed and understood, a critical factor for long-term +maintenance, auditing, and compliance. + +Through this thesis, the aim is to shed light on the significance of software +reproducibility, exploring how it can be effectively achieved and the tools and +practices that can facilitate this goal. This exploration is not only crucial +for the technological advancement but also for upholding the principles of +reliability, security, and transparency in an increasingly software-dependent +world. 
+ +== Goals + +In this master thesis, my primary focus will be to provide a comprehensive +overview of reproducible builds, within the sphere of software development, +acknowledging that a complete examination of every aspect of reproducibility is +beyond our scope. + +I will explore a selection of tools and methodologies that promote +reproducibility or, at least, create favorable conditions for facilitating it. +Moreover, this document is intended to enlighten and hopefully convince the +reader that the construction of reproducible software should be a fundamental +principle, not merely a secondary consideration, within the software development +lifecycle. Finally, I will delve into the rationale for adopting this +reproducibility paradigm as a standard practice in modern #gls("SE"), with +particular emphasis on security implications. + +By the conclusion of this thesis, the reader will have a comprehensive +understanding of the concept of reproducibility and how best practices can be +implemented effectively in software development projects. + +== Structure + +Organised into several chapters, this thesis systematically explores the +multifaceted nature of software reproducibility. + +- @chapter1 is this introduction, outlining the thesis's scope and + objectives. +- @chapter2 introduces the origin of the concept of reproducibility, tracing its + lineage from scientific principles. It proposes a terminology and formalisms, + and discusses the associated challenges. +- @chapter3 is a hands-on exploration, delving into specific real-world + examples. It will demonstrate practical applications of the concepts discussed + in previous chapters, including proof of concept implementations, concrete + case studies, and detailed analyses of real-world scenarios where + reproducibility plays a crucial role. This chapter aims to bridge theory with + practice, showing how the principles of reproducibility are applied and + sometimes challenged in real-world settings. 
+- The final @chapter4 synthesises the insights gained throughout the thesis. It + offers recommendations for best practices based on the research and + discussions presented. Moreover, it suggests directions for future work, + identifying areas where further research, development, and discussion are + needed to advance the field of software reproducibility. diff --git a/src/thesis/2-related-work.typ b/src/thesis/2-related-work.typ new file mode 100644 index 0000000..0f450d0 --- /dev/null +++ b/src/thesis/2-related-work.typ @@ -0,0 +1,11 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * +#import "theme/common/titlepage.typ": * +#import "theme/common/metadata.typ": * +#import "theme/disclaimer.typ": * +#import "theme/acknowledgement.typ": * +#import "theme/abstract.typ": * +#import "theme/glossary.typ": * +#import "theme/infos.typ": * + += Related work diff --git a/src/thesis/2-reproducibility.typ b/src/thesis/2-reproducibility.typ new file mode 100644 index 0000000..0730779 --- /dev/null +++ b/src/thesis/2-reproducibility.typ @@ -0,0 +1,1773 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * +#import "theme/common/titlepage.typ": * +#import "theme/common/metadata.typ": * +#import "theme/disclaimer.typ": * +#import "theme/leftblank.typ": * +#import "theme/acknowledgement.typ": * +#import "theme/abstract.typ": * +#import "theme/glossary.typ": * +#import "theme/infos.typ": * +#import "theme/definition.typ": * + +#chapterquote( + title: "Reproducibility", + ref: "chapter2", + quoteAttribution: , + quoteText: [ + Reproducibility is a minimum necessary condition for a finding to be + believable and informative. + ], +) + +== Reproducibility in Science + +#info-box(kind: "cite", footer: [@kpopper1934])[ + No serious physicist would offer for publication, as a scientific discovery, + one for whose reproduction he could give no instructions. 
+] + +The concept of reproducibility lies at the heart of scientific inquiry, serving +as a critical benchmark for the validation and acceptance of research findings. +It is a principle that transcends scientific disciplines, insisting that the +results of an experiment or study must be consistently replicable under +identical conditions by different researchers. This aspect of the scientific +method ensures the reliability and integrity of scientific knowledge. It +establishes a framework where hypotheses are not just tested but also subjected +to repeated verification, underpinning the trust and credibility that society +places in scientific discoveries. The journey of reproducibility, originating +from the earliest scientific endeavors, has evolved to adapt to the complexities +and nuances of modern research methodologies. This evolution mirrors the +progression of scientific thought and technology, from rudimentary experiments +to sophisticated, computer-assisted analyses. + +One can observe the glimpse of the first traces of this concept in @kpopper1934. +The concept of reproducibility is far from new and has been a cornerstone in the +sciences for centuries. It aims to explain natural phenomena in an objective and +repeatable manner. + +According to @Castillo1669, the scientific method (@scientificmethod), a +formalised and widely-adopted process for exploring observations and answering +questions, is inherently designed to be repeatable. However, this does not +guarantee that the results of all experiments conducted using the scientific +method will be reproducible. When results cannot be replicated, it raises +questions about the validity of the experiment and the credibility of the +researcher. + +#info-box[ + In the realm of scientific research, #emph[repeatable] and #emph[reproducible] + are terms often used interchangeably, yet they hold distinct meanings. 
+ + #emph[Repeatable research] refers to the ability of a study or experiment to + yield the same results when conducted again under the same conditions with the + same materials and methods by the same researchers. It primarily focuses on + the consistency and reliability of results within the original research + context. + + On the other hand, #emph[reproducible research] emphasises the ability of an + independent researcher to arrive at the same findings and conclusions using + the original study's raw data and following the same methodologies, but + possibly under different conditions and with different tools. Reproducibility + extends the validation process beyond the original researchers, ensuring that + the results hold up under scrutiny and can be reliably used as a foundation + for further study. + + Together, repeatability and reproducibility are foundational to the integrity + and advancement of scientific knowledge, allowing for a deeper trust and + understanding of research findings. +] + +While reproducibility can be considered closely aligned with the scientific +method, it is not an intrinsic part of it. The scientific method is a procedural +approach for conducting experiments, whereas reproducibility is a quality +attribute of the experimental results (@scientificmethodwithreproducibility). + +As of 2016, some of its basic terms were not standardised. This diverse +nomenclature has led to confusion, both conceptual and operational, about what +kind of confirmation is needed to trust a given scientific result. + +#grid( + columns: (1fr, 1fr), + [ + #figure( + include "../../resources/typst/scientific-method.typ", + caption: [Scientific method], + ) + ], + [ + #figure( + include "../../resources/typst/scientific-method-w-r13y.typ", + caption: [Scientific method with reproducibility], + ) + ], +) + +Reproducibility in research is a major factor that determines the uniqueness of +research studies. 
It means obtaining consistent results using the same data and +protocol as the original study. For example, researchers confirm the validity of +a new discovery by repeating the experiments that produced the original results. +Moreover, other researchers in the field are also able to repeat the same +experiments producing the results similar to the original. + +=== Reproducibility Levels + +According to @ESSAWY2020104753, reproducibility is organised in four levels: + +- *Repeatability*: Achieved upon obtaining consistent results using the same + input data, computational steps, methods, and code on the original + researcher’s machine. This level is normally achieved in scientific papers. +- *Runnability*: Achieved when the author of the research can obtain consistent + results using the same input data, computational steps, methods, code and + conditions of analysis on a new machine. +- *Reproducibility*: Achieved when a new researcher, not an original author of + the analysis, is able to reproduce the analysis in their own computational + environment #cite(, form: "normal"). +- *Replicability*: Achieved by obtaining consistent results across studies aimed + at answering the same scientific question, each of which has obtained its own + data #cite(, form: "normal"). Replicability also allows scientists + not involved in the original study to build from and expand on research once + they are first able to reproduce that research. + +#figure( + include "../../resources/typst/essawy-table.typ", + caption: [The four levels of reproducibility and their requirements.], + kind: "table", + supplement: [Table], +) + +#info-box[ + It's crucial to understand that these levels are interconnected and not + isolated. Achieving reproducibility level means that the criteria for both + repeatability and runnability levels have been met. +] + +@table-levels-of-reproducibility delineates four levels of reproducibility, each +with specific prerequisites. 
It's important to acknowledge that these levels are +organised in ascending order of difficulty to attain, starting from the simplest +to the most challenging. Consequently, progressing through these levels +necessitates an incremental investment of resources, time, and effort. + +=== Formalisation + +#definition(term: "Experiment")[ + An experiment $e$ conducted with a set of parameters and conditions $p$ where + $r(e,p)$ represents the experiment results, is #emph[reproducible] if and only + if: + + #box[ + $ + & forall e in E, forall e' in R(e), forall p in "par"(e), quad r( + e, p + ) eq r(e', p) + $ + ] + + where + - $E$ is the set of all possible experiments + - $"par"$ is a function defined as $"par": E -> cal(P)(P)$ where $cal(P)(P)$ is the + powerset of $P$, the set of all possible parameters of all possible + experiments + - $R(e)$ is a function defined as $R: E -> cal(P)(E)$ where $cal(P)(E)$ is a powerset + of $E$ that gives for each experiment $e in E$, its set of independent + replications +] + +== Reproducibility in Computer Science + +#info-box(kind: "cite", footer: [@Barba2018])[ + In their vision of reproducible research, readers should be able to rebuild + published results using the author’s underlying programs and raw data. + Implicitly, they are advocating for open code and data. +] + +As we shift our focus from general scientific domains to the realm of +#gls("CS"), the principles of reproducibility undergo a unique transformation. +In our digital era, where computations and algorithms form the backbone of +research, reproducibility challenges and solutions take on new dimensions. The +intricate interplay of software, hardware, and data in #gls("CS") demands a +re-examination and adaptation of traditional reproducibility concepts. 
This is +where the principles established in the broader scientific community intersect +with the specificities of computing, leading to a distinct and crucial discourse +on reproducibility in the field of #gls("CS"). + +The initial recorded use of the term #emph[reproducible research] in an academic +paper is believed to have occurred in 1992, in a presentation by +@Claerbout1992's team at Stanford, during the Society of Exploration +Geophysicists conference. In @Schwab2000, the same group of researchers updated +their definition of #emph[reproducibility in computationally oriented research]. + +In @Donoho2009, it is stated that reproducibility depends on open code and data. +The authors define reproducible computational research as that +#emph[ + in which all details of computations, code and data, are made conveniently + available to others. +] + +#definition( + term: "Repeatability (Same team, same experimental setup)", + name: "acm_repeatability", +)[ + The measurement can be obtained with stated precision by the same team using + the same measurement procedure, the same measuring system, under the same + operating conditions, in the same location on multiple trials. For + computational experiments, this means that a researcher can reliably repeat + her own computation. +] + +@Goodman2016 acknowledge the lack of standardisation in foundational terms like +reproducibility, replicability, reliability, robustness, and generalizability. +To address this, they suggest a new lexicon: #emph[Methods Reproducibility] to +align with the original concept of reproducibility as defined by @Claerbout1992 +and @Donoho2009, #emph[Results Reproducibility] corresponding to what @Peng2009 +refers to as replicability, and #emph[Inferential Reproducibility] to denote a +distinct category. 
+ +#definition( + term: "Reproducibility (Different team, same experimental setup)", + name: "acm_reproducibility", +)[ + The measurement can be obtained with stated precision by a different team + using the same measurement procedure, the same measuring system, under the + same operating conditions, in the same or a different location on multiple + trials. For computational experiments, this means that an independent group + can obtain the same result using the author’s own artefacts. +] + +The term #emph[reproducibility] in the context of #gls("CS") has been refined +and explored in many subsequent works, and identifying a single #emph[first] +definition can be challenging due to the evolution of the concept over time. +According to @Barba2018, who wrote a detailed article on the history of the +terminology, the most appropriate terminology (@acm_repeatability, +@acm_reproducibility, @acm_replicability) to describe reproducibility in the +context of #gls("CS") is the set of definitions derived from +@acm_artifact_review_badging[Artifact Review and Badging]. + +#definition( + term: "Replicability (Different team, different experimental setup)", + name: "acm_replicability", +)[ + The measurement can be obtained with stated precision by a different team, a + different measuring system, in a different location on multiple trials. For + computational experiments, this means that an independent group can obtain the + same result using artefacts which they develop completely independently. +] + +In the context of this document, @reproducibility is the definition of +#emph[reproducibility] that we will use when referring to the concept of +reproducibility in #gls("CS"). + +#definition(term: [Reproducibility], name: "reproducibility")[ + Reproducibility is the ability to consistently obtain identical results across + multiple runs of a computer task when using the same methods and data, + regardless of method, space and time. 
Note that this does not necessarily + imply that the outputs are correct or the desired outputs. +] + +#info-box(kind: "important")[ + #emph[Space] and #emph[Time] are terms borrowed from physics. In the context + of reproducibility in #gls("SE"), space refers to different systems, while + time refers to different moments in time + #cite(,form:"normal"). (more about that in + @def-deterministic-build). +] + +=== Scope + +In this master thesis, the exploration of reproducibility will focus on a +specific aspect: ensuring the reproducibility of building source code to ensure +that the resulting application works. This critical area is paramount in the +field of +#gls("SE"), where the ability to consistently recreate identical software +artefacts from a given source code under varying conditions and environments +stands as a central concern. The intent is not to undermine the importance of +the other facets of reproducibility, but rather to concentrate the efforts and +analysis mainly on the reproducibility of source code building and compilation. + +#info-box[ + We acknowledge that languages like JavaScript, PHP, and Python are not compiled + but merely interpreted by their respective interpreters. Often, these scripting + languages require dependencies provided by their respective package managers as + well. Ensuring the availability of these dependencies is an integral part of + the software build process and, to some extent, corresponds to the compilation + in non-compiled languages. Consequently, in the context of this thesis, + "compiling source code" is applicable to both types of programming languages. +] + +The concept of reproducibility can be distinctly categorised into multiple +phases, such as #emph[reproducibility at build time] +(@def-reproducibility-build-time) and #emph[reproducibility at run time] +(@def-reproducibility-run-time). It is important to note that these phases are +not mutually exclusive and can be combined to achieve a higher level of +reproducibility. 
+ +#definition( + name: "def-reproducibility-build-time", + term: "Reproducibility at build time", +)[ + Reproducibility at build time refers to the ability to consistently generate + the same executable or software artefact from a given source code across + different builds on different environments, across different space and time. + This aspect is crucial in ensuring that the software compilation process is + deterministic and immune to variances in development environments, compiler + versions, or build tools. It involves a meticulous standardisation and + documentation of the build environment and dependencies to guarantee that the + same executable is produced regardless of when or where the build occurs. +] + +#definition( + name: "def-reproducibility-run-time", + term: "Reproducibility at run time", +)[ + Reproducibility at run time addresses the consistency of software behaviour + and output when the software is executed in different environments or under + varying conditions. This type of reproducibility focuses on ensuring that the + software performs identically and produces the same results regardless of the + #gls("OS"), underlying hardware, or external dependencies it interacts + with during execution. +] + +To illustrate these phases, the C source code in @montecarlo-pi.c implements the +Monte Carlo method to approximate the value of π. This is an example of +reproducibility at build time, but not at run time. 
+ +#figure( + { + sourcefile( + file: "montecarlo-pi.c", + lang: "c", + read("../../resources/sourcecode/montecarlo-pi.c"), + ) + }, + caption: [`montecarlo-pi.c`], +) + +#figure( + { + set text(size: .9em) + shell(read("../../resources/sourcecode/montecarlo-pi-compilation.log")) + }, + caption: [ + Building the same source code multiple times always yields the same binary + executable + ], + supplement: "Terminal session", + kind: "terminal", +) + +The Monte Carlo algorithm is inherently stochastic: it uses random sampling or +probabilistic simulation as a core part of its computation. This randomness is +intrinsic to the algorithm's design and purpose. + +The distinction between build time and run time reproducibility for the Monte +Carlo algorithm arises from its usage of a random source. While the algorithm +can be reliably built into a consistent binary (build time reproducibility), its +outputs can vary on different executions under the same conditions due to its +inherent randomness (lack of runtime reproducibility). This does not undermine +the validity of the algorithm but rather is a characteristic of its +probabilistic approach to problem-solving. + +#figure( + shell(read("../../resources/sourcecode/montecarlo-pi.c.log")), + caption: [ + Running the binary multiple times does not always yield the same result + ], + supplement: "Terminal session", + kind: "terminal", +) + +In practice, for certain applications, runtime reproducibility can be attained +by controlling the random number generator, specifically by setting a fixed seed +as an input parameter. + +In the next example, the source code is not reproducible at build time and we +might erroneously think that the program is reproducible at run time. + +We observed that compiling the same source code multiple times results in +different binaries. 
This variation occurs because the source code includes the +macros `__TIME__` and `__DATE__`, which are substituted with the current time +and date during compilation. As a result, we cannot achieve reproducibility at +build time. + +#figure( + { + sourcefile( + file: "datetime.c", + lang: "c", + read("../../resources/sourcecode/datetime.c"), + ) + }, + caption: [Source code of `datetime.c`, a C program with macros], +) + +Upon executing the produced binary, the outcome appears consistent. This might +suggest that the binary is reproducible at run time; however, this assumption is +incorrect. Consider a scenario where a different user compiles the same source +code. In such a case, runtime reproducibility between the original and another +user is not assured. + +#figure( + shell(read("../../resources/sourcecode/datetime.c.log")), + caption: [ + An example of a program that is reproducible neither at build time nor at + run time. + ], + supplement: "Terminal session", + kind: "terminal", +) + +=== Quantifying Reproducibility + +Reproducibility is traditionally viewed as a binary state: a computation is +either reproducible or not. However, this perspective oversimplifies the +complexity of software environments. In reality, reproducibility exists on a +spectrum, where the focus shifts from a mere #emph[yes-or-no] assessment to +evaluating the extent and conditions under which a computation is reproducible. + +#figure( + include "../../resources/typst/reproducibility-rule.typ", + caption: "Reproducibility states", +) + +We will explore this concept with Docker images as a primary example. Docker, a +popular containerization platform, uses Dockerfiles (@dockerfile-example). +Basically, a `Dockerfile` is a script with a set of instructions to build +images. These images are then used to run software in a consistent environment. +However, many images on the Docker Hub #cite(, form:"normal") present +challenges to reproducibility. 
The reasons vary: some Dockerfiles are not +publicly available but especially because most of them include significant +variability in their build processes, making exact replication of the images +pretty much impossible. We will consider these challenges in more detail in +@chapter4. + +#figure( + sourcefile( + file: "nodejs.dockerfile", + lang: "dockerfile", + read("../../resources/sourcecode/nodejs.dockerfile"), + ), + caption: [An example of `Dockerfile`], +) + +Determining a precise value for a Docker image's temporal reproducibility is +complex. Thus, for the purposes of this thesis, we simplify the classification +into three broad categories as outlined in @reproducibility-rule: +#emph[Not reproducible], #emph[Partially reproducible], #emph[Reproducible]. +While more nuanced classifications are possible, this simplified tripartite +model provides a sufficient basis in this thesis. + +Despite these challenges, Docker images are widely used. They remain static +between updates, creating a window during which their environment is consistent. +This period (the interval between two successive updates) can serve as an +indirect measure of reproducibility. Essentially, the longer the time between +updates, the more stable and, by extension, reproducible the image is +considered. In considering the temporal dimension of reproducibility, it is +essential to recognise that software artefacts are not unchanging entities; they +offer a predictable environment for a finite period. Imagine a scale where 0 +represents non-reproducibility and 1 indicates full reproducibility. On this +scale, the temporal reproducibility of Docker images would be positioned between +0 and 1, acknowledging the nuanced nature of this concept. + +However, this reproducibility is inherently dynamic due to the nature of +software evolution. Each update to a Docker image might introduce changes +affecting the software's behaviour, thereby impacting its reproducibility over +time. 
Understanding this aspect of temporal reproducibility is crucial for +managing software environments in a continuously advancing technological +landscape. + +The Docker use case can be classified under the #emph[Partially reproducible] +class. This is because, while Docker images ensure reproducibility at run time +by providing a consistent execution environment, they often fall short of +reproducibility at build time due to the variability inherent in their +Dockerfiles. This dichotomy highlights the spectrum of reproducibility, where +Docker images occupy an intermediate position. They are neither fully +reproducible (due to build-time variability) nor completely irreproducible +(thanks to their runtime stability). This categorization not only helps in +understanding the reproducibility status of Docker images but also underscores +the need for a nuanced approach to classifying software reproducibility, +acknowledging the various shades that exist between the #emph[Not reproducible] +and #emph[Reproducible] boundaries. + +=== Open Source + +#info-box(kind: "cite", footer: [@Donoho2009])[ + If everyone on a research team knows that everything they do is going to + someday be published for reproducibility, they’ll behave differently from day + one. Striving for reproducibility imposes a discipline that leads to better + work. +] + +Open Source refers to a type of software whose source code is freely available +for anyone to view, modify, and distribute. This model encourages collaborative +development and sharing, allowing users and developers to improve the software +and adapt it to their needs. + +Open-source software development, known for managing complex projects with high +quality, significantly enhances reproducibility by fostering professionalism and +transparency. 
Making open-source software reproducible offers numerous +advantages: it streamlines the onboarding of new contributors, improves testing +and feature implementation, ensures transparent build processes, facilitates +security audits, and quickens response times in the dependency supply chain in +case of issues. + +Reproducibility, intrinsically linked with Open Source, is fundamentally an +activity that builds trust (@barba2022definingroleopensource), making it a +leading method for ensuring software can be reliably built and verified by a +diverse global community​​. + +A direct correspondence can be established with the taxonomy of +@ESSAWY2020104753 when considering the process of building software from source +code. This process can be analogised to a scientific experiment. In this +context, the act of an individual building the source code of another developer +on their own machine mirrors the concept of #emph["Reproducibility"] as +described in @table-levels-of-reproducibility. This signifies that the software, +when compiled by different users from its source code, consistently results in +the same executable or software artefact. The transparency inherent in +open-source software is a foundational advantage. Since the source code is +publicly available, it allows researchers to scrutinise how the software +operates, understand how results are generated, and validate the reliability and +accuracy of the software. This level of openness is crucial for replicability +and trust in scientific research. + +Furthermore, open-source software promotes a culture of collaboration and +community involvement. Active communities that grow around open-source projects +contribute to the software’s continual improvement. This community-driven +development leads to the identification and resolution of bugs, thereby +enhancing the software's reliability and, consequently, the reproducibility +outcomes that depend on it. 
+ +A key feature of open-source software is the permissive nature of its licencing, +which, depending on the specific licence, facilitates the reuse and modification +of software without legal or technical barriers. This flexibility is vital for +verifying and replicating studies, as researchers can adapt the software for +their specific needs without restrictions, though some licences may impose +certain conditions. Additionally, open-source development tools provide +excellent record-keeping capabilities, like version control systems (e.g., `git`, +Mercurial, Pijul), enabling researchers to track changes and understand the +context of each update. This aspect is essential for reproducing and validating +research findings. + +Lastly, the open source approach aligns well with the scientific values of +openness and sharing, promoting a culture that values transparency and +reproducibility in scientific inquiry. Moreover, the community-driven nature of +open-source software reduces the risk of obsolescence, ensuring that research +tools remain accessible and up-to-date for future replication efforts. + +In essence, open-source software embodies a framework that is not only conducive +to the scientific pursuit of knowledge but also reinforces the integrity and +sustainability of #gls("SE") through its emphasis on transparency, +collaboration, and adaptability. + +Open-source development, by its nature of allowing anyone to build, verify and +use software, stands out as an effective, if not the best, approach to +bolstering both confidence and safety in software systems. This widespread +participation and verification process inherent in open-source development +contributes significantly to the reliability and security of the software. + +=== Terminology + +Establishing formal definitions and terminology is crucial for aligning +researchers, practitioners, and readers on the same wavelength. 
By articulating +a clear and precise mathematical representation, we facilitate a universal +understanding of what it means for a computation to be reproducible. + +The following section is dedicated to constructing such formal definitions, +striking a balance between the rigor required by the academic community and the +clarity needed for widespread adoption. + +==== Computation + +A computation is a process that involves the execution of algorithms or a series +of operations to obtain a result, usually performed by a computer. It can be +complex, involving multiple steps, conditions, and data manipulations +(@inputs-computation-outputs). The formal definition of computation takes into +account the computational environment variable, reflecting the specific context +where the computation occurs. + +#figure( + include "../../resources/typst/inputs-computation-outputs.typ", + caption: "Inputs, computation, outputs", +) + +In the context of #gls("CS"), defining a computation involves considering the +broader scope of activities and processes that a computer performs, extending +beyond the traditional mathematical abstraction of a function. A computation can +be understood as a sequence of steps or operations performed by a computer to +transform input data into output data. This process can involve various types of +functions, algorithms and data manipulations. Essentially, a computation can be +depicted as an abstraction involving one or multiple functions. + +Examples of computations could be: a program build, a compilation, a program +execution, a data analysis, a data transformation. + +- When source code is compiled, the input is the source code and the output is + the binary executable. +- When a program is executed, the input is the binary and the output is the + result of the program. +- When performing a data analysis, the input is the raw data and the output is the + analysis. 
+- When evaluating a function, method, or procedure in any programming language, + the input consists of the function itself along with its parameters. The + output is the result of the function applied to these parameters, including + any potential side effects (e.g., changes in the program's state). + +#definition(term: "Computation", name: "def-computation")[ + A computation $c$ is a set of one or more functions $f: I times E -> R$, + + where + - $I$ is the set of all possible arguments or inputs the computation needs + - $E$ is the set of all possible execution environments (hardware, software, + space, time) + - $R$ is the set of all possible outputs +] + +It is crucial to distinguish functions from computations, both of which are +integral in the realms of Mathematics and #gls("CS"). In Mathematics, a function is a deterministic +construct defining a specific relationship between sets of inputs and outputs, +mapping each input to exactly one output. It acts as a fundamental building +block within computations to describe how values are transformed. In #gls("CS"), +functions are similar but can be classified as pure (@section-pure-function) or +impure (@section-impure-function), with pure functions having no side effects +and impure functions potentially affecting the state or relying on external +variables. While a function provides the rules for individual transformations +within a computation, the computation itself represents the broader and more +dynamic process of achieving a result, often involving the execution of complex +algorithms, data handling, and the application of multiple functions and +operations. 
+ +#grid( + columns: (1fr, 1fr), + gutter: 1em, + [ + #figure( + { + set text(font: "Virgil 3 YOFF") + cetz.canvas({ + import cetz.draw: * + + rect((-3, 3), (3, -3), stroke: white) + circle((0, 0), radius: 2) + content((0, -2.25), "Functions") + content((0, -3.25), text(fill: white, "Environment")) + }) + }, + caption: [Functions in the context of Mathematics], + ) + ], + [ + #figure( + { + set text(font: "Virgil 3 YOFF") + cetz.canvas({ + import cetz.draw: * + + rect((-3, 3), (3, -3)) + circle((0, 0), radius: 2) + line((0, 2), (0, -2)) + content((1, 0), "pure") + content((-1, 0), "impure") + content((0, -2.25), "Functions") + content((0, -3.25), "Computational environment") + }) + }, + caption: [Functions in the context of #gls("CS")], + ) + ], +) + +In #gls("CS") (@functions-in-cs), a function necessitates an environment in +which it will be evaluated, effectively making, to some extent, this environment +an extra input parameter per se. This computational environment, which +encompasses the hardware (e.g., filesystem, memory, #gls("CPU", long: false)), +software (e.g., #gls("OS", long: false)) and date (e.g., the current date and +time), may influence the function's behaviour and output. Consequently, +functions in #gls("CS") are inherently designed to interact with and adapt to +their environment, thereby making them dynamic and versatile but also +potentially non-deterministic. + +Conversely, in Mathematics (@functions-in-mathematics), a function is evaluated +independently of any environment, or with the environment variable effectively +set to null, ensuring its behaviour is entirely predictable and self-contained. +This distinction highlights the adaptability and complexity of functions in +computational contexts, compared to their more stable and defined mathematical +equivalents. + +==== Inputs and Outputs + +An input is the data provided to a computation. 
The output is the result of a +computation or any other changes made to the environment the computation is +being evaluated in. + +#definition(term: "inputs and outputs", name: "def-inputs-outputs")[ + The function $f: I -> R$ maps the domain, the input set $I$, to the + codomain, the output set $R$. +] + +Inputs and outputs can vary widely, ranging from user interactions and network +connections to files and directories. The nature of these inputs and outputs +significantly impacts the reproducibility of computational processes. + +Consider user interactions, such as mouse or eye movements. These are +inherently challenging to replicate precisely due to their dynamic and +unpredictable nature. For instance, reproducing the exact trajectory of a mouse +movement is virtually impossible due to the minute variations in human actions. +However, a more reproducible approach would be to capture these interactions in +a structured format, as done in `eyeScrollR` (@Larigaldie2024). Recording the +coordinates of mouse movements over time in a file creates a detailed log that +can be replayed. This arbitrary method transforms a non-reproducible user +interaction into a reproducible set of data. + +For a computation to be considered reproducible, its inputs and outputs must be +storable and retrievable. Typically, the most feasible types for such storage +are files or directories, primarily due to the ubiquity and accessibility of +file systems in computing environments. Files and directories offer a stable and +widely accessible medium to store and retrieve data. + +In this thesis, the focus will be on scenarios where inputs and outputs are in +the form of files, unless specified otherwise. This assumption aligns with the +common practices in computational processes and aids in maintaining the +reproducibility of the computations discussed. + +In the context of software compilation, an output is correct when it faithfully +reflects the state of its transitive inputs. 
Basically, the output represents +all direct and indirect dependencies used in the build process. +"Transitive inputs" refer to not only the direct inputs (e.g., source code) but +also to the inputs of those inputs (e.g., libraries, frameworks, compilers, data +resources). + +From the point of view of the software build process as shown in +@inputs-outputs-part1, the inputs are all the source code files, configuration +files, and dependencies required to build the software. + +#figure( + include "../../resources/typst/inputs-and-outputs-part1.typ", + caption: "Inputs, computation, outputs", +) + +In @inputs-outputs-part2, the process has been refined from the perspective of +the user running the software, where the input is now composed of the program +and its parameters. This distinction is crucial as it highlights the dynamic +nature of computational processes. The user's interaction with the software, +such as providing parameters or executing commands, is integral to the inputs +and can significantly influence the output. + +#figure( + include "../../resources/typst/inputs-and-outputs-part2.typ", + caption: "The input is now composed of the program and its parameters", +) + +In @inputs-outputs-part3, the environment where the computation is evaluated is +added to the input. This environment includes the hardware, software, space, and +time in which the computation is executed. This addition further refines the +definition of inputs and outputs, emphasising the dynamic and context-dependent +nature of computational processes. + +#figure( + include "../../resources/typst/inputs-and-outputs-part3.typ", + caption: [ + The input is now composed of the program and its parameters and the + environment where it is going to be evaluated. + ], +) + +We could break down the environment further. However, as we delve deeper into +segmenting the components essential for a computation, the process becomes +increasingly subjective (@hinsenKonrad2020guix).
+ +Reproducibility implies to compare outputs to determine if they are equivalent. +According to @Acm2018[p.5], there are multiple equivalence classes: + +#figure( + include "../../resources/typst/equivalence-classes-of-reproducibility.typ", + caption: [Classes of reproducibility], + kind: "table", + supplement: "Table", +) + +- Two natural phenomena could be observed by human experts and considered as the + same. +- Two results could be statistically equivalent, in that the numeric values are + different, but they both convey the same statistical interpretation. +- Two results could be the same data in the sense that they encode the same + numeric contents, but differ in some irrelevant detail. For example, an output + file might incidentally contain the system time and the name of the user who + ran the program. +- Two results could be equivalent, in every way, bit-per-bit. This is the + strictest form of equivalence. + +In the context of this thesis, we will assume that two results are equivalent if +they are the same, bit-per-bit. + +#info-box(kind: "important")[ + It is important to clarify that in the context of reproducibility, the time + taken to compute the output is not typically considered. This means that two + results can be deemed equivalent or reproducible even if the computational + time to achieve these results varies. For instance, consider a situation where + a piece of code is refactored: if the output data remains unchanged, the + process is considered reproducible from a data consistency perspective. + Nonetheless, even if the refactored code requires significantly more time and + resources to execute, it is still classified as reproducible as long as the + output remains consistent with the original. + + This distinction underscores that reproducibility focuses on the consistency + of the output data rather than the performance or efficiency of the + computational process. 
This aspect is particularly relevant in environments + where hardware or system efficiencies may differ, yet the integrity and + equivalence of the output data remain the primary concern. While this might + provoke debate regarding resource efficiency and computational time, for the + purposes of this thesis, it is assumed that the temporal and resource aspects + of computing the output are secondary to the consistency of the results + themselves. +] + +==== Evaluation of a Computation + +The evaluation of a computation is the process of determining the resulting +output of a function for a given set of arguments. It involves applying, in a +specific computational environment, the function's defined operations to the +inputs to produce an output. This does not necessarily imply that the outputs +are correct. Note that, #emph[correct] means that the evaluation has +successfully completed without errors. + +#figure( + include "../../resources/typst/inputs-and-outputs-part4.typ", + caption: [ + The evaluation of inputs into outputs where the input is composed of the + program and its parameters and the environment where it is going to be + evaluated. + ], +) + +#definition(term: "Evaluation", name: "def-evaluation")[ + $"eval": (F, I, E) -> R$ is a function that evaluates a function $f$ and its + parameters $i$ in a specific computational environment $e$ to produce a + result, an output. + + $forall f in F, forall i in I, forall e in E, quad "eval"(f,i,e) eq f(i, e)$ + + where + - $F$ is the set of all possible computations + - $I$ is the set of all possible arguments the computation needs + - $E$ is the set of all possible execution environments (hardware, software, + space, time) + - $R$ is the set of all possible outputs +] + +In the realm of mathematics, a function is typically isolated, operating solely +on its provided arguments, with no external environmental factors influencing +its output. 
Contrarily in #gls("CS"), it is quite common for a computation to +interact with, and be influenced by, its surrounding environment during +evaluation, which necessitates @def-evaluation. + +==== Pure Function + +As seen in @functions-in-mathematics, the concept of a #emph[pure function] as +defined in @def-pure-function does not explicitly exist in Mathematics. This is +because functions are always inherently considered to be deterministic and +side-effect free; hence, functions in maths are #emph[by default] pure. Any +mathematical function evaluates under the assumption that given the same inputs, +the output will always be the same, and the evaluation of the function does not +alter any external state or variable. + +#definition(term: "Pure function", name: "def-pure-function")[ + A pure function can be defined as a function where the same input always + yields the same output. + + Let $f: I times E → R$ be a function. Then $f$ is *pure* if and only if: + + $forall i in I, forall e_1, e_2 in E, quad "eval"(f,i,e_1) eq "eval"(f,i,e_2)$ + + where + - $I$ is the set of all possible input arguments + - $E$ is the set of all possible execution environments (hardware, software, + space, time) + + A bridge can be drawn between the mathematical definition of a function + $f: I -> R$ and this definition by considering the environment variable $E$ as + an empty set, making the function independent of any external state or + variable. This effectively reduces the definition of a pure function in + #gls("CS") to the mathematical definition of a function. +] + +However in #gls("CS"), it makes sense to define what are pure and impure +functions because a function might behave differently depending on the +environment in which it is executed.
Therefore, the purity of a function in the +context of #gls("CS") is vital for understanding and managing side effects +and state in software; it is a distinction that doesn't apply in the static, +deterministic realm of pure mathematics. + +This distinction highlights how the same term can have different implications in +different disciplines, reflecting the unique nature of challenges and concepts +in programming versus pure mathematics. However, we can still try to define such +a function in a theoretical #gls("CS") context. + +A pure function is a specific type of function in programming characterised by +the following properties: + +- Deterministic: for a given set of inputs, a pure function always returns the + same output. This means the function's output is solely determined by its input + values and does not rely on any external state or data. + +- No side effects: A pure function does not cause any observable side effects in + the system. This means it does not modify any external state, global variables, + or data outside its scope. It also does not produce outputs other than its + return value, such as printing to the console or altering the state of the + program beyond the scope of the function. + +#info-box[ + A checksum (@checksum) is an example of a pure function. It will consistently + return the same output for a given input. +] + +==== Impure Function + +An impure function is the opposite of the above definition of a pure function. + +#definition(term: "Impure function", name: "def-impure-function")[ + An impure function is a function that does not always yield the same output + for a given input. This can be formally expressed as: + + Let $f: I times E → R$ be a function.
Then $f$ is *impure* if and only if: + + $exists i in I, exists e_1, e_2 in E, quad "eval"(f,i,e_1) eq.not "eval"( + f,i,e_2 + )$ + + where + - $I$ is the set of all possible input arguments + - $E$ is the set of all possible execution environments (hardware, software, + space, time) +] + +It is a specific type of function in programming characterised by the following +properties: + +- Non-deterministic: the function can yield different outputs for the same set + of input values at different times, depending on the state of the system or + environment in which it is executed. + +- Side effects: the function performs actions that modify some state outside its + local environment or has observable interactions with the outside world. This + can include altering global variables, modifying input arguments, I/O + operations, or calling other impure functions. + +As seen in @functions-in-cs, this concept only exists in programming, as it is a +direct consequence of the mutable nature of the state in programming. In pure +mathematics, functions are conceptualised as mappings from elements of one set +(the domain) to elements of another set (the codomain), without any side effects +or external dependencies. This distinction highlights the difference between the +theoretical framework of mathematics and the practical aspects of programming, +where functions often interact with a mutable state or environment. + +Given this, we will allow ourselves to define such a function in the theoretical +context of computer science. This implies that, to define such a function, an +additional parameter, which will be used to calculate the time, #emph[must] be +passed as a parameter to the function. This parameter corresponds to the `E` +parameter in @def-impure-function. + +Given this, we will allow ourselves to define such a function within the +theoretical context of #gls("CS").
+ +#info-box[ + An example of an impure function is one that returns the current date and + time, as its output depends on external state and can vary with each call. +] + +==== Checksum + +Although understanding the concept of a checksum is not essential for +understanding the definitions, it is crucial to define it due to its recurring +presence in the next chapters. + +A checksum is the result of a computation. It is a one-way pure function which +takes an input of an arbitrary size and returns a string of a fixed size, +depending on the checksum algorithm in use. For example, when using `git`, each +commit ID is a checksum of the current commit's content and the previous +commit's ID. + +#figure( + include "../../resources/typst/figure-checksum.typ", + caption: "Inputs, checksum, string output", +) + +A one-way function is easy to compute but is practically impossible to reverse. +This is mostly due to the fixed size output: the number of possible inputs +(#emph[domain]) exceeds the number of possible outputs (#emph[codomain]). The +time complexity of such a function is usually linear, which means that the time +it takes to compute the checksum is proportional to the size of the input, +therefore $cal(O)(n)$. + +A checksum is a function that returns a string called #emph[hash], which is +supposedly unique for a given input. Checksum algorithms are designed to +produce a unique hash for each unique input. However, the term "supposedly +unique" is used because, in theory, it is possible for two different inputs to +produce the same hash, an occurrence known as a #emph[collision]. The ability to +find collisions undermines the security of the algorithm. There are different +types of algorithms to calculate a checksum +(e.g., #gls("MD5", long: false), #gls("SHA1", long: false), +#gls("SHA2", long: false)).
Older algorithms like #gls("MD5", long: false) have +known vulnerabilities that allow collision attacks while more modern algorithms +like SHA-256 (#gls("SHA2", long: false)) are currently considered to be pretty +much impossible to crack. + +// TODO: Add bibtex ref in glossary for checksum algorithm + +While the mathematical theory allows for the possibility of collisions in +checksum hashes, the reality of their application in modern checksum algorithms +is substantially different. The sophisticated design of these algorithms +significantly reduces the likelihood of such occurrences. This ensures a high +level of trust in their effectiveness for generating distinct and reliable +representations of data, despite the theoretical potential for identical hashes +of different inputs. + + +#info-box(kind: "info")[ + Choosing an appropriate checksum algorithm is paramount due to the rapid + evolution of computational power as described by Moore's Law, which leads to + previously secure algorithms becoming vulnerable as computing capabilities + expand. + + For instance, #gls("MD5") checksums, once deemed secure for storing passwords, + are now easily compromised through brute force attacks. This underlines the + need for an adaptable approach to checksums, continually updating them to stay + ahead of advancements in computational attack strategies. According to + @courtes_2022_6581453[Notes on SHA-1, p.16], the SHA-1 algorithm family is now + approaching end of life. + + To ensure the highest level of security and adaptability to future + computational capabilities, it is advisable to use the SHA-2 algorithm family, such + as SHA-384 or SHA-512. These algorithms provide a longer bit length, offering + enhanced security and a lower risk of collisions, making them well-suited for + securing sensitive data in the face of evolving technological threats.
+] + +==== Reproducibility + +The concept of reproducibility can be applied in many situations; this thesis +will concentrate on a particular application area, thus narrowing its scope. In +this thesis, a computation will typically refer to the process of compiling +source code into a binary file, except in cases where it is explicitly defined +differently. + +Reproducibility is a property of a computation. It is the ability to +consistently obtain identical results across multiple runs of a computation. + +#definition(term: "Reproducibility", name: "def-reproducibility")[ + Reproducibility is a property of a computation satisfying the following + condition: + + #box[ + $ + & forall c in C, forall i in I, forall e_1, e_2 in E, quad "eval"( + c, i, e_1 + ) eq "eval"(c, i, e_2) + $ + ] + + where + - $C$ is the set of all possible computations + - $I$ is the set of all possible input arguments + - $E$ is the set of all possible execution environments (hardware, software, + space, time) + + Once that condition is met, the computation is considered to be reproducible. +] + +The sets $I$ and $E$, respectively representing the set of all possible inputs +and the hardware and software environment including the date and time, are also +considered as abstractions. In reality, these sets are complex and intricate +entities, that could potentially be composed of many interdependent components. +However, for the purpose of this definition, they are treated as atomic. + +We could consider expanding the list of arguments to achieve greater +specificity, delving deeper into the intricate details that influence +reproducibility. However, the objective here is to provide the reader with an +initial understanding of reproducibility through a formal definition.
This +approach is about finding a balance between comprehensive detail and conceptual +clarity, thereby offering a foundational glimpse into the formalism that +underpins reproducible computational processes without becoming mired in +excessive complexity. + +The definition of reproducibility (@def-reproducibility) closely matches the +definition of pure function (@def-pure-function) and, inherently, mathematical +functions. However, as seen in @table-function-computation, understanding the +nuances between theoretical functions and practical computations is essential. +Theoretically, mathematical functions are conceptualised as $I -> R$, reflecting +the abstract nature of mathematics where the function's result $R$ is purely +dependent on its inputs $I$ and external factors are considered non-existent. In +the practical world, this theoretical construct is transposed through an +evaluation function (@def-evaluation). For mathematical functions, this +environment parameter is known and intentionally left empty, symbolising the +deliberate exclusion of external influences and striving to maintain the purity +of the theoretical definition. This is in contrast to practical computations in +programming, where the environment parameter $E$ is often filled with various +real-world parameters and factors, reflecting the nature of computations where +outcomes are influenced by the environment variable. + +#figure( + include "../../resources/typst/functions-vs-computations.typ", + caption: [Nuances between functions and computations], + kind: "table", + supplement: "Table", +) + +This fundamental distinction underscores the challenges of achieving +reproducibility and predictability in the practical realm, necessitating +robustness and adaptability to manage the variability and complexity of +real-world conditions. 
Together, these definitions provide a comprehensive +paradigm for understanding the interplay between the idealised theoretical +constructs and their practical applications, emphasising the importance of +environmental control in ensuring the computations' reproducibility. The concept +of reproducibility, a computational property, underscores the ability to +replicate results across different environments within $E$, serving as a +cornerstone for verifying and validating scientific work. + +The process of controlling the computational environment $E$ underscores a +fundamental challenge in #gls("SE"): achieving reproducibility through +environment standardisation. The environment often encompasses factors such as +hardware and software configurations (#gls("CPU"), #gls("OS"), library +versions, and runtime conditions), which can significantly impact a function's +behaviour and output. The Monte Carlo simulation algorithm (@montecarlo-pi.c) +exemplifies this challenge: it may be reproducible at build time but can exhibit +variance at run time due to environmental factors. + +This singularity highlights the essence of reproducibility: the need to +meticulously control or normalise the environment in which computations occur. +By ensuring that the environment ideally remains constant, we can more closely +approximate the behaviour of pure computations in practical software systems. +This approach does not merely aim to simplify the computational model but serves +as a strategic endeavour to minimise the unpredictability introduced by varying +environments. + +In conclusion, while the formalism of computations' purity and reproducibility +provides the basis of a theoretical framework, the practical application in +#gls("SE") involves the intricate task of environment management.
It is through +this lens that we understand reproducibility not just as a characteristic of the +function itself, but as a holistic property of the entire computational +ecosystem, encompassing both the function and its operating environment. This +broader view acknowledges that while pure functions offer a paradigm for +reproducibility, achieving this in complex, real-world systems often +necessitates rigorous control and standardisation of the computational +environment which is virtually impossible to deliver. + +=== Software Security + +The concept of reproducibility is pivotal in software security for several +reasons. Reproducibility ensures that software can be consistently recreated or +regenerated from its source code, guaranteeing that the software's behaviour +remains unchanged across different builds. This consistency is crucial for +verifying the security of software systems. If a software build is reproducible, +security experts can confidently assess that the build has not been tampered +with or altered to include malicious code. This becomes increasingly important +in an era where cybersecurity threats are both sophisticated and prevalent. + +In the context of software security, reproducibility also aids in the +traceability and verification of software components. It allows for the thorough +examination and validation of all parts of the software, ensuring they are +exactly as intended and free from vulnerabilities or unauthorised alterations. +This traceability is particularly relevant in light of the executive order +14028, #emph[Improving the Nation's Cybersecurity], issued by +@Executive-Order-14028. This document underscores the importance of enhancing +cybersecurity across federal agencies and emphasises the integrity of the +software supply chain. + +The european counterpart, the #gls("CRA") by the European Union reinforces these +efforts by setting cybersecurity requirements for software. 
This act aims to +reduce vulnerabilities in software products, enhancing security throughout their +lifecycle. Software must come with clear information on their features and +instructions for secure installation, operation, and maintenance. This strategy +reflects a commitment to producing and using reproducible software. + +==== Software Bill Of Materials + +The #gls("SBOM") is an essential element, acting as a detailed inventory of all +the components required to build and operate a piece of software, including all +applied patches and licensing information in a structured and well-known format. + +There are multiple existing formats and standards, the most common ones are: +- #gls("SPDX", long: true): A comprehensive standard maintained by the Linux + Foundation, designed to facilitate license compliance, security, and broader + software component analysis through a detailed documentation approach, + supporting multiple formats like RDF, JSON, and YAML. It caters to a wide + range of stakeholders, including software companies, legal teams, and + open-source projects, with a particular strength in granular licensing + details. +- #gls("CycloneDX", long: false): A lightweight #gls("SBOM") standard aimed at + enhancing application security and managing software supply chain risks. It + emphasises simplicity and efficiency, supporting formats such as XML, JSON, + and ProtoBuf, and is particularly tailored towards the identification of + software components, their vulnerabilities, and risk assessments, making it a + favorite in the application security and #gls("DevSecOps") communities. + +The key differences between the #gls("SPDX") and #gls("CycloneDX") formats lie +primarily in their focus, structure, and community support. The choice between +#gls("SPDX") and #gls("CycloneDX") should be guided by an organisation's +specific needs, whether the focus is on extensive licensing compliance or +streamlined security and risk management within the software supply chain. 
+ +==== Supply Chain + +A software application is composed of many components, each of which is +developed by different teams or organisations. These components are then +composed together into a final product, which is the software application +itself. This process is known as the +#emph[software supply chain] #cite(, form: "normal"). + +Contemporary software development leverages the concepts of composability and +reusability, preferring the integration and reuse of existing libraries over +developing new functionalities from scratch. This methodology enhances +productivity and contributes to the creation of more reliable software by +allowing each component to concentrate on executing a specific function +effectively. Nevertheless, this reliance on external components leads to the +accumulation of both direct and indirect dependencies, complicating the software +supply chain significantly. The build environments, which encompass all +necessary components and their precise versions for software compilation, become +intricate and challenging to replicate across different systems and over time. +This complexity is often described as #emph[dependency hell]. While Semantic +Versioning (@package-managers) offers a strategy to mitigate these issues, it +alone is insufficient to ensure reproducibility +#cite(, form: "normal", supplement: [p.11]). + +To illustrate this concept, the graph in +@python-runtime-dependencies-graph-with-flaw acts as a simplified #gls("SBOM") +for "My App" version `1.2.3`, highlighting its runtime dependencies essential +for the application's operation. This visualization selectively excludes the +build-time dependencies required for the application's compilation to maintain +conciseness. A vulnerability has been identified in `xz` (marked in red), a +critical runtime dependency. 
Consequently, this vulnerability could potentially +compromise its dependent components (marked in orange), including the +application itself, underscoring the interconnected risk within the software's +dependency graph. This scenario, while being a simplified representation, +mirrors the recent CVE-2024-3094 #cite(, form: "normal") in the +`xz` project #cite(, form: "normal"), which affected numerous software +applications and highlighted the criticality of managing software supply chain +risks. + +#figure( + include "../../resources/typst/my-app-graph-not-ok.typ", + caption: [ + Dependency graph of `my-app` version `1.2.3`, where a flaw has been detected + in `xz` dependency + ], +) + +These issues are known as #emph[supply chain attacks], a type of cyber attack +that targets vulnerabilities in the supply chain of software or hardware +products, with the aim of compromising the final product by infiltrating its +development or distribution process. This can involve tampering with the +production of components, the assembly of systems, or the delivery of software +updates, thereby infecting end users who trust these sources. One particular +aspect of supply chain attacks is that even the original authors of the software +may be unaware that their product has been compromised, as the malicious +alterations often occur downstream in the supply chain. Although not as frequent +as direct attacks on software or systems, supply chain attacks are becoming +increasingly common due to their potential for widespread impact. 
Gartner +predicts that by 2025, 45% of organisations worldwide will have experienced +attacks on their software supply chains, a three-fold increase from 2021 +#footnote[ + https://www.gartner.com/en/newsroom/press-releases/2022-03-07-gartner-identifies-top-security-and-risk-management-trends-for-2022 +] while Cybersecurity Ventures predicts that the global cost of software supply +chain attacks to businesses will reach nearly \$138 billion by 2031 +#footnote[https://go.snyk.io/2023-supply-chain-attacks-report.html]. +Notable examples include the #emph[Stuxnet] worm in 2010 +#cite(, form: "normal"), +the #emph[Heartbleed] bug discovered in 2014 +#cite(, form: "normal"), and the #emph[SolarWinds] breach in 2020 +#cite(, form: "normal"). These incidents highlight the +exploitation of interconnectedness and inherent trust within the supply chain, +making supply chain attacks particularly insidious and effective methods of +cyber warfare that can simultaneously affect a large number of users or +organisations. + +==== Reproducibility And Security + +Reproducibility is a fundamental aspect of software security, particularly in +the context of the software supply chain. It ensures that software can be +reliably and consistently regenerated from its source code, thereby safeguarding +against malicious alterations or tampering. This is particularly relevant in the +context of supply chain attacks, where the integrity of the software supply +chain is compromised, potentially leading to widespread security breaches. + +It is paramount to have a clear understanding that having something reproducible +doesn't mean that it is secure. It is a necessary condition but not a sufficient +one. If a compiler is flawed, it might produce reproducible builds that could +also be potentially insecure. 
+ +#figure( + { + set text(font: "Virgil 3 YOFF") + image("../../resources/images/security-independent-builds.svg") + }, + caption: [ + The reproducible builds approach to increasing trust in executables built by + untrusted third parties. + ], +) + +In @security-independent-builds, inspired by @abs-2104-06020, end-users should +disregard the binary artefact supplied by their software vendor if its checksum +(`806e7...9c271`) diverges from those generated by independent third parties +(`4e14e...4c0a9`). The security of software is deemed more robust when its +reproducibility is confirmed across multiple environments. It is the consensus +among these environments that contributes to the perception of security. The +premise here is not merely the reproducibility, but the uniformity of this +reproducibility across space and time, which strengthens the trust in the +software's integrity and security. + +As cyber threats evolve, ensuring that software can be reliably and consistently +bit-per-bit reproduced from its source code becomes a cornerstone for +maintaining security integrity. Reproducibility not only facilitates the +verification of software for tampering or malicious alterations but also +strengthens trust in software systems amidst the growing complexity of cyber +threats. Therefore, integrating reproducibility into software development and +distribution processes is a crucial step towards enhancing overall cybersecurity +resilience and safeguarding against the ever-increasing sophistication of cyber +attacks. + +=== Reproducibility Utopia + +Reproducibility in #gls("SE") is often considered a utopia. The exact +replication of software poses a significant challenge. Thus, while striving +for reproducibility is essential, achieving absolute reproducibility is +frequently unattainable in practice. + +One of the primary impediments in achieving reproducibility lies in the +dependency on hardware architecture.
Software compiled for different +architectures, such as `x86` and `ARM`, inherently produces disparate binaries. +These differences stem from the instruction sets and optimizations that are +specific to each platform, leading to divergent outputs despite using identical +source code. This variance highlights a significant reproducibility challenge, +as achieving bitwise identical results across architectures is *not feasible* as +of today. + +Compilers (e.g., GCC, Rustc, #LaTeX, Typst) also play a role in software +development, transforming high-level code into machine-level instructions. +However, not all compilers operate deterministically. In this context, +non-determinism refers to the phenomenon where compilers produce different +outputs given the same input source code across different compilations. Factors +contributing to this non-determinism include variations in memory allocation, +inclusion of timestamps, and embedding of file paths in the binary output. These +variances pose challenges to achieving consistent, reproducible builds. + +#info-box[ + A compiler is essentially an application that transforms input into output. + Tools like GCC are referred to as compilers because they convert high-level + code into machine-level instructions. However, the term #emph[compiler] is not + limited to programming languages alone. For example, #LaTeX is a compiler that + transforms a `.tex` file into a `.pdf` file, rustc compiles a `.rs` file into + a binary file, and Typst compiles a `.typ` file into a `.pdf` file. Typically, + compilers convert human-readable files into machine-readable files. +] + +In @chapter3, acknowledging the reality that full reproducibility may not be +entirely achievable, we will delve deeper into these challenges by exploring the +impact of non-deterministic compilers and the strategies to mitigate these +challenges using different methods.
+ +== Deterministic Builds And Environments + +In this section, we will explore the concept of deterministic builds, and the +potential sources of non-determinism in software builds. + +The concept of deterministic builds is essential for ensuring reproducibility. A +build is termed #emph[deterministic] when it consistently generates identical +outputs from a given set of inputs, irrespective of the environment or time of +execution. This predictability is central to software reproducibility, yet +several sources of non-determinism frequently challenge its realisation. One +single non-deterministic component in a build process can render the entire +build non-deterministic. Therefore, it is crucial to identify and understand +these sources of non-determinism to ensure reproducibility. Many of these +sources of non-determinism are related to the environment in which the build +occurs. This environment encompasses the hardware, software, and runtime +conditions in which the build process is executed. These factors can +significantly influence the build process, thereby impacting the stability of +the output. + +#definition(term: "Deterministic build", name: "def-deterministic-build")[ + Let $B$ be a build process defined as a function: + + $B: I times E -> O$ + + where + - $I$ is the set of all possible input arguments + - $E$ is the set of all possible execution environments (hardware, software, + space, time) + + then the build B is deterministic if $I times E$ is deterministic: + + $"Determinism"(I times E) -> "Determinism"(B)$ + + where `Determinism` is a function asserting that its argument is + deterministic. +] + +According to @abs-2104-06020, a reproducible build environment is essential for +achieving deterministic and reproducible builds. It ensures consistency in the +software building process by providing a controlled and predictable set of +conditions under which the software can be built. 
@malka-hal-04430009[p.1] +further elaborate that a build environment is reproducible in both space and +time when it is possible to replicate the same build environment on any machine +and at any point in the past or future. + +#info-box[ + When a process exhibits a lack of reproducibility over time, it indicates a + fundamental instability within the process. While it would be technically + feasible to replicate the same output in a different environment, within the + same architecture, achieving exact temporal replication of the build process + is practically impossible. This temporal variability serves as a critical + indicator of potential difficulties in ensuring reproducibility across diverse + environments or machines. +] + +=== Sources Of Non-Determinism + +In this section we will explore the sources of non-determinism in software +builds and usage. The list is not exhaustive; it just includes the most common +sources of non-determinism. The list is created from @abs-2104-06020's paper +and information from the @ReproducibleBuildsOrg project, a website aiming at +improving reproducible builds in software development. + +==== Randomness + +Using random data in a computation is a common source of non-determinism and +must be avoided. When random data is required, the solution is to use a +predetermined value acting as a seed to the pseudo-random number generator. +Using a predetermined value as a seed ensures that the same random data is +generated each time the computation is executed, thereby guaranteeing +reproducibility. + +Hardcoding the seed in the source code would be nonsensical because it wouldn't +be random anymore; the seed should be passed as a parameter to the computation. +This parameter can be passed as a command-line argument, an environment +variable, or a configuration file, leaving the responsibility to the user to +provide a seed. + +==== Build Paths + +Build paths are paths used by the source code to locate files and resources.
+Sometimes, it can happen that absolute paths are used in the source code, which +means that the build will only be reproducible on the same machine where it was +built. + +To avoid this, relative paths should be used instead of absolute paths and +sometimes post-processing is required to remove the build path or to normalize +it with a predefined value. + +==== Volatile Inputs + +Volatile inputs are inputs that can change over time. For example, the current +date and time are volatile inputs, network streams as well. Dealing with date +and time will be done in @timestamps. For network streams, the solution is to +never rely on remote data while building. Instead, the data should be downloaded +beforehand and stored locally. + +This is a common issue in the context of software compilation, where the build +process might download dependencies from the internet during the build. + +==== Package Managers + +Package managers are tools that automate the process of installing, upgrading, +configuring, and removing packages, typically from a central repository or +package registry. They are widely used in software development to manage +dependencies and facilitate the build process. For example, `Cargo` for Rust, +`Composer` for PHP, `NPM` for NodeJS, `Dune` for OCaml. They are also used to +manage software at the operating system level like: `apt` in Debian based +distributions, `pacman` in Arch Linux, `dnf` in Fedora, `brew` in MacOS, +`chocolatey` in Windows. + +Package managers can inadvertently introduce non-determinism by automatically +downloading or updating dependencies to their latest versions. This process can +lead to inconsistencies, particularly when a newer version of a package includes +changes that are incompatible with the project's codebase. To mitigate this, the +#gls("SemVer") scheme is widely adopted, offering a structured version naming +convention that aids dependency management. 
However, while packages may declare +#gls("SemVer") compliance, adherence levels vary, with some strictly following +#gls("SemVer") principles and others adopting them more leniently +#cite(, form: "normal", supplement: [p.5]). Notably, there has been a +trend towards increasing adoption and stricter adherence to #gls("SemVer") +principles by package managers over time +#cite(, form: "normal", supplement: [p.13]). This scheme provides a structured +version naming convention designed to convey the nature of changes between +releases, thereby aiding in the management of dependencies with a syntax that +succinctly specifies version constraints. While this mechanism greatly +facilitates dependency resolution by leveraging a minimalistic syntax, it +inherently permits variability over time, potentially compromising +reproducibility. + +#figure( + sourcefile( + file: "composer.json", + lang: "json", + read("../../resources/sourcecode/composer.json"), + ), + caption: [A `composer.json` file, used by the PHP package manager, Composer], +) + +In @composer-json, the dependency `foo/http` is specified with version `^1`, +where the caret symbol (`^`) indicates that Composer should install the latest +minor version within the major version `1`. In contrast, the dependency +`foo/bar` is locked to version `1.2.3`, signalling that Composer must install +that specific version, regardless of newer releases. This distinction +underscores the importance of using package managers judiciously to achieve +determinism. For Composer, determinism is further ensured by including a +`composer.lock` file in the project, which explicitly pins each dependency to a +particular version, thus facilitating reproducibility. The decision to require +this file varies by project and is not in the scope of this master thesis. + +Ensuring reproducibility in the context of package managers is particularly +challenging due to the number of different ecosystems and the lack of +standardisation.
For example, in the Python realm#footnote[ + https://linuxfr.org/news/l-installation-et-la-distribution-de-paquets-python-1-4 +], there have been and there are still multiple package manager ecosystems: +`distutils`, `setuptools`, `pip`, `pypi`, `venv`, `conda`, `anaconda`, `poetry`, +`hatch`, `rye`. Each of these has its own configuration file format, which can +be used to specify the version of each dependency. However, there is no +standardisation, which makes it difficult to ensure reproducibility. The same +issue applies to operating system package managers. For example, in Debian +based distributions, there are multiple package managers: `apt`, `aptitude`, +`dpkg`. + +The solution would be to use a universal package manager that would work for all +Linux distributions and programming languages. This is what tools like +`AppImage`, `snap` and `flatpak` are trying to solve, only at the level of the +operating system. These tools only partially fix the issue, as they are +available only for installing packages at the operating system level. + +These tools, while being a step in the right direction, are also coming with +their own set of issues, like the lack of standardisation between them, the lack +of adoption and the lack of support from major distributions. + +There are also package managers like `Nix` and `Guix` that are trying to solve +the issue by being universal. They provide a way to build and install packages +in a sandboxed environment, which means that packages are isolated from the rest +of the system at build time. This is a great way to ensure reproducibility; we +will discover them in @chapter4. + +==== Version Information + +Version information like commit identifiers can be used to precisely identify +the source code used to build a program.
+ +#figure( + shell(read("../../resources/sourcecode/listing-typst-version.log")), + caption: [Example of program including a commit ID], +) + +As illustrated in @listing-typst-version, incorporating specific version +information, such as a commit ID, helps reproduce a build by facilitating the +retrieval of an identical source code version. Nevertheless, the efficacy of +commit IDs as reproducibility anchors remains debatable. These identifiers may +frequently be unavailable at the time of build. It is essential to recognize +that the metadata of `git`, including commit IDs, is not an intrinsic element of +the source code. Instead, it is part of the version control system in use. +`git` is a distributed version control system designed to handle everything from +small to very large projects with speed and efficiency; it allows multiple +developers to work together on the same project simultaneously, providing a +robust system for tracking changes, version history, and collaboration. However, +the potential for easy substitution of one version control system for another +renders reliance on such ephemeral metadata a precarious foundation for software +reproducibility. + +In scenarios where a version number is necessary, it can be derived from a +dedicated file, such as a changelog, or alternatively provided through an +environment variable. This approach decouples the versioning process from the +underlying version control system, potentially offering a more stable and +reliable method for software version identification. + +==== File Order + +It is important to ensure that multiple files are always processed in the same, +stable order. + +Listing files relies on the low-level #gls("POSIX", long: false) call `readdir`, which itself is +dependent on the filesystem in use and therefore doesn't guarantee any +consistent ordering. + +#info-box(kind: "info")[ + According to @LibCManual[p.415]: The order in which files appear in a + directory tends to be fairly random.
A more useful program would sort the + entries before printing them. + + In @tlpi[p.354]: The filenames returned by `readdir()` are not in sorted + order, but rather in the order in which they happen to occur in the directory. + This depends on the order in which the file system adds files to the directory + and how it fills gaps in the directory list after files are removed. +] + +There are numerous situations where relying on an existing list of files can +result in non-determinism. For instance, when generating an archive from the +contents of a directory, many file systems do not provide consistent ordering +when listing files within that directory. Consequently, the arrangement of files +in the archive may differ between builds, causing unpredictable archives. +Although these archives might contain identical content, they could have been +compressed with varying file orders. + +To address this, one could enforce a stable order by explicitly sorting the +inputs before processing them. This can be done by sorting the list of files in +the directory based on a specific criterion, such as their names or modification +timestamps. + +#figure( + shell(read("../../resources/sourcecode/tar-sort-name-flag.log")), + caption: [ + Use of `--sort=name` flag to ensure a stable order of files in an archive + ], +) + +==== Timestamps + +Timestamps are among the biggest sources of non-determinism in software builds, +as they can lead to differences due to changing times between builds. Since +reproducibility checks the content of the output and its metadata, building +the same source code multiple times will create output artefacts with possibly +the same content but with different metadata, like file timestamps, making them +irreproducible. + +Often, timestamps are used to approximate which version of the source was +built. Since file timestamps are volatile, the source code needs to be tracked +more accurately than just a timestamp.
Just like for version information, the +solution would be to extract the date from a dedicated file like a changelog, or +a specific commit #cite( ,form: "normal"). + +To circumvent this issue, `SOURCE_DATE_EPOCH` is a specific environment variable +convention for pinning timestamps to a specific value that has been introduced +by the `reproducible-builds.org` community and it is now widely used by many +compilers and build tools. + +Another option is to use `libfaketime`, a library that intercepts system +function calls retrieving the current time of day and replies with a predefined +date and time instead. + +When none of these options are viable, using a tool like `strip-nondeterminism` +#cite(, form: "normal") is a temporary workaround for +stripping non-deterministic information such as timestamps and filesystem +ordering from various file and archive formats. + +==== Locale Environment Variables + +#figure( + shell(read("../../resources/sourcecode/date-format-flags.log")), + caption: [Use `LC_ALL` and `-u` flags to configure the date format], +) + +`LC_ALL` is a locale environment variable that can modify various aspects of an +application's behaviour. It can change the date format, string collation order, +and character encoding. Although each parameter can be set individually, +`LC_ALL` enables you to configure them all simultaneously and override any other +locale environment variables. + +In @listing-date-format-flags, we methodically incorporate various flags, such +as `-u`, and the `LC_ALL` environment variable to the `date` command. This +approach ensures that the output we receive is predictable and consistent, +regardless of the underlying system configuration. + +=== Comparing Builds + +In the quest for software reproducibility, identifying and understanding the +differences between two builds of the same software becomes paramount, +especially when those builds are not identical. 
This section introduces a tool +designed specifically for this purpose. + +Developed under the umbrella of the @ReproducibleBuildsOrg effort, `diffoscope` +#cite(, form:"normal") is a comprehensive, open-source tool that +excels in comparing files and directories. Its unique capability to recursively +unpack archives of various types and transform binary formats into a +human-readable form makes it an indispensable tool for software comparison. It +seeks to simplify the process of identifying discrepancies between software +builds. This functionality is crucial for developers and researchers striving to +pinpoint and resolve the causes of non-reproducibility. An online version of the +tool is also available#footnote[https://try.diffoscope.org/]. + +To demonstrate the effectiveness of `diffoscope` in identifying differences +between non-reproducible builds, @bash-gcc-not-reproducible-builds considers +the hypothetical example of a simple program that outputs the current date and +time. Due to its nature, compiling this program twice, even with the same source +code, will inherently produce two different builds. + +First, we compile the sourcecode twice, creating `build1` and `build2`: + +#figure( + shell( + read("../../resources/sourcecode/bash/bash-gcc-not-reproducible-builds.log"), + ), + caption: [ + Compilation of non-reproducible programs and the use of their checksums for + comparison + ], +) + +Then, we use `diffoscope` to compare these builds: + +#figure( + shell(read("../../resources/sourcecode/bash/bash-diffoscope-comparison.log")), +) + +The tool will generate a detailed report (@diffoscope-report) highlighting the +differences between `build1` and `build2`. In this hypothetical example, +differences might include timestamps or other build-specific metadata embedded +within the binary. 
+ +#figure( + { + image("../../resources/images/diffoscope-report.svg") + }, + caption: [A `diffoscope` report using HTML format], +) + +=== Fixing Builds + +In this subsection, we delve into strategies for addressing non-reproducible +builds, acknowledging the vast array of potential causes and the impossibility +of covering every solution comprehensively. + +Previously in @bash-gcc-not-reproducible-builds, we encountered an issue, where +compiling the sourcecode (@datetime.c) twice resulted in different binaries. +Using `diffoscope`, we identified, as shown in @diffoscope-report, the source of +variability as date and time strings embedded within the binaries. + +A solution has been proposed in @timestamps, we can leverage the +`SOURCE_DATE_EPOCH` environment variable to address this specific challenge in +achieving reproducible builds. This approach standardises the date and time used +during the build process, ensuring consistency across compilations and thus +contributing to reproducibility. + +#figure( + shell(read("../../resources/sourcecode/bash/bash-fixing-builds.log")), + caption: [Fix builds using an environment variable], +) + +== Conclusion + +This chapter embarked upon a detailed journey through the landscape of +reproducibility, focusing particularly on its pivotal role within the realms of +science and, more specifically, #gls("CS") and #gls("SE"). Through rigorous +analysis, we unveiled the multifaceted nature of reproducibility. + +We dissected the concept of reproducibility, from its foundational elements in +science to its intricate implications in computer science, delineating the +essential terminology that frames our discussion: computations, pure and impure +functions, inputs, outputs, and the environmental variables that intertwine to +influence reproducibility. 
The exploration into deterministic builds and the +sources of non-determinism not only highlights the inherent challenges but also +sets the stage for the subsequent focus on the tools and methodologies designed +to tame these complexities. + +As we pivot toward the next chapter, our narrative will transition from the +theoretical underpinnings to the practical arsenal at our disposal for enhancing +reproducibility in #gls("SE"). While the groundwork laid in this chapter paves +the way for an in-depth exploration, it is important to acknowledge the vast +landscape of tools and methodologies available in this domain. Given the scope +of this thesis, we will focus on four evaluation methods using three key tools, +with the understanding that this selection is not exhaustive but rather +representative of the broader ecosystem. Through the lens of real-world +applications and case studies, we will explore how these chosen tools are used +to mitigate the challenges identified herein and to foster an ecosystem where +reproducible research and development are not merely aspirational goals but +operational norms. + +In fine, this chapter serves as both a foundation and a bridge. It offers a +comprehensive understanding of reproducibility that is critical for appreciating +the significance of the solutions and methodologies discussed in the chapters +that follow. It is within this framework that we continue our quest to demystify +reproducibility, moving from conceptual clarity to practical application, with +the ultimate aim of enhancing the reliability, security, and transparency of +#gls("SE") practices.
diff --git a/src/thesis/3-tools.typ b/src/thesis/3-tools.typ new file mode 100644 index 0000000..d526404 --- /dev/null +++ b/src/thesis/3-tools.typ @@ -0,0 +1,1023 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * +#import "theme/common/titlepage.typ": * +#import "theme/common/metadata.typ": * +#import "theme/disclaimer.typ": * +#import "theme/leftblank.typ": * +#import "theme/acknowledgement.typ": * +#import "theme/abstract.typ": * +#import "theme/glossary.typ": * +#import "theme/infos.typ": * +#import "theme/definition.typ": * + +#chapterquote( + title: "Software evaluation", + ref: "chapter3", + quoteAttribution: , + quoteText: [ + Any sufficiently advanced technology is indistinguishable from magic. + ], +) + +This chapter explores the pivotal role of tooling in achieving reproducibility +within #gls("SE"), highlighting the importance of environment consistency, +dependency management, and process isolation. + +Reproducibility in #gls("SE") is not merely a desirable attribute but a +cornerstone of trustworthy, reliable, and verifiable software development +practices. As software systems grow increasingly complex and integral to every +facet of the modern world, from critical infrastructure to personal devices, the +stakes for ensuring their reproducibility have never been higher. This chapter +introduces and examines four distinct methods for building software, each with +its unique approach: + +- Bare compilation + + It is the most rudimentary method, depends on the operating system's compilers + and libraries for software construction. + +- Compilation with Docker + + Using containerization technology, encapsulates not just the software and its + dependencies but also the entire runtime environment. + +- Compilation with Nix + + Nix uses a unique store for packages built in isolation, each with a unique + identifier that includes dependencies, preventing conflicts and ensuring + reproducible environments. 
+ +- Compilation with Guix + + Inspired by Nix, Guix offers a transactional package management system that + isolates dependencies to ensure consistent and reproducible software + environments through specific version-linked profiles. + +The four methods chosen for detailed evaluation in the context of +reproducibility represent a wide range of approaches to managing software build +environments, each addressing different aspects of reproducibility. Bare +compilation was selected to provide a baseline, demonstrating the fundamental +challenges encountered without the aid of advanced tooling, such as +environmental inconsistencies and dependency conflicts. This method serves as a +contrast to the more sophisticated techniques that follow. Docker is included +for its widespread adoption and popularity, as well as its approach to +encapsulating the runtime environment, which significantly mitigates issues +arising from system variability. Guix and Nix are examined due to their unique +approach to dependency management and environment isolation, employing a package +management approach that is based on the functional paradigm +(@def-functional-package-management) to ensure exact reproducibility of +environments across different systems. The chapter aims to cover a spectrum from +the most basic to the most advanced strategies. + +#definition( + term: "Functional package management", + name: "def-functional-package-management", +)[ + From @10-1007-978-3-319-27308-2_47, functional package management is a + discipline that transcribes the functional programming paradigm to software + deployment: build and installation processes are viewed as pure functions + (@def-pure-function) in the mathematical sense whose result depends + exclusively on the inputs (@def-inputs-outputs), and their result is a value, + that is, an immutable directory.
+] + +This chapter aims to provide readers with an understanding of how these +contribute to the broader goal of reproducible #gls("SE"). Through a detailed +exploration of each approach, readers will gain insight into the strengths, +weaknesses, and applicability of Bare compilation, Docker, Guix and Nix in +various software development scenarios. + +== Methodology + +Our primary objective is to assess the reproducibility of a software build using +four different methods: Bare compilation, Docker, Guix, and Nix. By compiling a +C program (@datetime.c) with each tool, we can evaluate reproducibility both +over space and time (@reproducibility). + +The study uses a quantitative research design, focusing on the comparison of +binary files generated from compiling identical source code with different +methods, on the same environment. This approach allows for an empirical +assessment of reproducibility challenges inherent to each compilation +tool and environment. + +=== Evaluation Criteria + +We will consider three primary criteria. + +Firstly, *reproducibility in time* assesses whether the outputs of builds are +identical across repeated compilations in the same environment. This criterion +involves compiling the same source code twice with a few seconds of interval +between compilations. By comparing the outputs of these compilations, we can +determine if the build process produces consistent results over time. + +Secondly, *reproducibility in space* focuses on the consistency of build outputs +across different environments. To evaluate this, the same source code is +compiled in various environments. This process helps to ensure that the software +build process is not dependent on specific environmental factors and can produce +identical outputs regardless of where it is compiled. + +Lastly, the *reproducibility of the build environment* evaluates the stability +and consistency of the environment itself, including the dependencies required +for building the output. 
This criterion ensures that the environment, which +encompasses all necessary tools and libraries, remains stable and consistent +across different instances and setups. + +=== Tools And Technologies + +The evaluation of reproducibility tools in this study encompasses several +approaches to software compilation and package management, each with its unique +methodology. + +In @ch3-tool1, the bare compilation method involves direct compilation on the +host system without the use of containerization or package management tools. +This approach relies on the default tools and libraries installed in the +operating system, providing a straightforward but less controlled environment +for building software. This method is assessed to understand the baseline +reproducibility and potential variability introduced by the host system's native +environment. + +In @ch3-tool2, Docker is used to provide a containerized environment for +software compilation. Using Docker containers ensures that the build process +occurs in a consistent and isolated environment, independent of the host +system's configuration. This method helps in evaluating how containerization can +enhance reproducibility by encapsulating all necessary dependencies and tools +within a controlled and replicable environment. + +In @ch3-tool3, the Guix package ecosystem is employed to manage the software +build process. Guix focuses on providing a reproducible and declarative approach +to package management, ensuring that the build environment and dependencies are +precisely defined and versioned. This approach is examined for its ability to +maintain consistency and reproducibility across different systems and +environments by leveraging Guix's robust package management features. + +In @ch3-tool4, the Nix package ecosystem is used to manage and build software. +Similar to Guix, Nix offers a declarative and reproducible package management +system, allowing for precise control over the build environment and +dependencies.
The evaluation of Nix focuses on its capability to provide a +reproducible build environment that can be consistently replicated across +various systems, enhancing the reliability and stability of the software +development process. + +=== Scenarios + +Our examples and builds focus on custom-made scenarios to highlight the +differences in reproducibility across the four tools. There are multiple +scenarios being evaluated: + +In the first scenario, using @ch3-tool1, a C program is built using the host +default C compiler. The second scenario involves @ch3-tool2, where a C program +is built in a Docker container utilizing the C compiler. The third scenario, +with @ch3-tool3, involves building a C program using Guix. Finally, there are +two scenarios for @ch3-tool4: one involves building a C program using Nix legacy +(not flake), and the other uses Nix flake to build the same program. + +=== Compilation And Execution + +A trivial C program (@datetime.c) has been chosen for its straightforwardness, +allowing the focus to remain on the build process and output rather than +software complexity. + +Each method will compile the same C program (@datetime.c) twice. Detailed steps +for compilation and execution within each environment will be documented, +ensuring transparency and reproducibility of the process itself by the readers. +Each compilation's resulting output will be executed to verify functionality, +although the correctness of the execution's output will not be evaluated. + +=== Environment Setup + +To ensure the robustness and universality of our reproducibility assessment, all +test scenarios described in this chapter are executed through GitHub Actions +#cite(, form: "normal"). GitHub Actions is an automation platform +that enables #gls("CICD"), allowing builds to be performed, tested, and deployed +across various machines and architectures directly from GitHub repositories. 
+ +Our testing environment supports three distinct architectures: + +- `x86_64-linux`: This represents the widely used Linux operating systems on + Intel and AMD processors. To ensure a thorough evaluation, two instances, each + running a different version of Ubuntu (`20.04` and `22.04`), are employed. +- `x86_64-darwin`: Dedicated to supporting macOS on Intel processors. +- `aarch64-darwin`: Addressing the latest generation of macOS powered by the + ARM-based Apple Silicon processors. + +This selection encompasses both `x86` and `ARM` architectures, as well as Linux +and macOS operating systems, providing a comprehensive view of reproducibility +across the most commonly used development platforms in #gls("SE"). The choice of +these architectures ensures the results are relevant to a broad spectrum of +development environments and application targets. + +Each of our scenarios is streamlined through the use of a `Makefile`. A +`Makefile`, as seen in @ch3-example-makefile, is a text file that contains a set +of directives used by the GNU `make` #cite(form: "normal", ) utility to +automate the build process of software projects. These directives contain +specific shell commands. + +#figure( + { + sourcefile( + file: "Makefile", + lang: "Makefile", + read("../../resources/sourcecode/example-makefile"), + ) + }, + caption: [An example of `Makefile` used in a scenario.], +) + +Each scenario's `Makefile` contains four essential steps: + +- `clean`: Removes the build artefact of a build process, if any. +- `build`: Executes a build process, generating an output artefact. +- `check`: Prints the checksum of the build artefact.
+- `run`: Executes the build artefact. + +Incorporating these `Makefile` steps into our GitHub Actions workflows not only +automates the execution of each scenario, ensuring consistency and repeatability +in our testing process, but also empowers the reader with the ability to locally +reproduce the steps outlined in this document in full transparency. This +approach facilitates and encourages the direct replication of methods and +scenarios, aligning with best practices in #gls("SE") for reproducibility, and +also extends these principles to broader scientific research practices. + +=== Output Comparison + +To compare the results, we will compare the checksums of the resulting outputs. +We exclusively use the `nix hash path` command provided by the Nix package +manager to compute the hash of a path. + +#info-box(kind: "important")[ + The `nix hash path` command is provided by Nix, a tool we will explore in this + chapter. Nix provides this command as part of its suite, but it can be applied + anywhere, not just to files within the Nix ecosystem. This command + distinguishes itself by its capacity to hash directories in addition to files. + An alternative to this approach could have been the use of a + #gls("SWHID") #cite(,form:"normal"). +] + +The `nix` command is available on systems with Nix installed. The difference +with a traditional `sha256sum` is that the former computes the hash of the path, +which includes the content and the metadata, while the latter computes the hash +of the content only. Another advantage of using that command is its ability to +create a hash prefixed by the algorithm used, similar to #gls("SRI") +#cite(, form: "normal") hashes. + +=== Expected Outcomes + +In contrast to the previous, more theoretical chapters, this practical +chapter aims to empirically compare the differences in reproducibility +achievable with Bare compilation, Docker, Guix, and Nix.
Insights into the +challenges and benefits of each method will inform best practices in #gls("SE") +for achieving reproducible builds. + +== Evaluation 1 - Bare compilation + +This method is the most rudimentary approach to software compilation, relying on +the host system's installed compilers and libraries to build software. This +build method corresponds to Scenario 1, with the corresponding `Makefile` in +@ch3-makefile-scenario1, that can be executed on any system, with the commands: +`make build` to compile, `make check` to print the checksum, #raw("make run") to +run the compiled binary. As explained in @ch3-compilation-execution, we notice +that the steps are executed twice and in @ch3-tool1-build, the steps to build, +check and run the build are detailed. + +#figure( + { + sourcefile( + file: "Makefile", + lang: "Makefile", + read("../../lib/scenario-1/Makefile"), + ) + }, + caption: [`Makefile` of Scenario 1], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-1.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Terminal log of the steps to build, check and run Scenario 1], +) + +At lines 4 and 9 of @ch3-tool1-build, we notice that the `make check` step +prints two different checksums, indicating that the output of the two builds is +different at each run. As a result, this build is not reproducible. This +discrepancy in the output is likely caused by the dynamic replacement of the +`__DATE__` and `__TIME__` macros in the source code, which are replaced with the +current date and time at the moment of compilation. + +#heading(outlined: false, level: 3, "Reproducibility In Time") + +This method involves directly compiling source code on a system with only the +essential compilers and libraries available on the host. This method's primary +advantage lies in its simplicity and direct control over the build process, +allowing for a clear understanding of dependencies and compilation steps. 
+However, it lacks isolation from the system environment, leading to potential +#emph[it works on my machine] issues due to variations in system configurations. +Additionally, the lack of encapsulation and dependency management can lead to +difficulties in achieving consistent and reproducible builds across different +environments. This method is therefore classified as non-reproducible in time. + +#heading(outlined: false, level: 3, "Reproducibility In Space") + +This method is not reproducible in time; therefore, we will consider it as not +reproducible in space either. Technically it would be possible to reproduce the +same output on another environment, but it would be practically impossible to +run the build at exactly the same time. This method is therefore classified as +non-reproducible in space. + +#heading(outlined: false, level: 3, "Reproducibility Of The Build Environment") + +The virtual machines used on GitHub Actions are versioned. However, the software +installed on the images is not. From one build to another, we can have +a different version of `gcc` or any other software installed on the image. +Therefore, we have absolutely no control over the build environment and it is +very complicated to reproduce the same environment on another machine. +Consequently, reproducibility of the build environment is not guaranteed. + +== Evaluation 2 - Docker + +Docker #cite(,form:"normal") has revolutionised software deployment by +encapsulating applications in containers, ensuring they run consistently across +any environment. Unlike traditional virtual machines, Docker containers are +lightweight, share the host's kernel, and bundle applications with their +dependencies, promoting the principle of #emph["build once, run anywhere"]. This +approach streamlines development, testing, and production workflows, +significantly reducing compatibility issues and, to some extent, simplifying +scalability. 
+ +Central to Docker's appeal is its contribution to the #gls("DevOps") movement, +fostering better collaboration between development and operations teams by +eliminating the #emph["it works on my machine"] problem. Docker's ecosystem, +including the Docker Hub #cite(, form: "normal"), offers a vast +repository of container images, facilitating reuse and collaboration across the +software community. + +Docker uses the #gls("OCI") standard for its container images, ensuring +interoperability across different containerization technologies, including +@podman and @kubernetes. The #gls("OCI") specification outlines a format for +container images and a runtime environment, aiming to create a standard that +supports portability and consistency across various platforms and cloud +environments. + +Due to its popularity, Docker is a key player in modern software development, +enabling efficient, consistent, and scalable applications through +containerization, supporting agile and #gls("DevOps") practices, and +accelerating the transition from development to production. + +#figure( + sourcefile( + file: "Dockerfile", + lang: "dockerfile", + read("../../lib/scenario-2/Dockerfile"), + ), + caption: [From Scenario 2, the `dockerfile` used by Docker], +) + +This method involves creating an #gls("OCI") image, compiling @datetime.c, +through a `Dockerfile` and setting the compilation result as default command as +shown in @ch3-dockerfile. This ensures that each time the image is executed, the +compiled executable runs within the container. However, instead of printing only +the checksum of the resulting binary, the `check` step also outputs the checksum +of the image. 
+ +#figure( + shell(read("../../resources/sourcecode/scenario-2.log")), + supplement: "Terminal session", + kind: "terminal", + caption: [Terminal log of the steps to build, check and run Scenario 2], +) + +#heading(outlined: false, level: 3, "Reproducibility In Time") + +In @ch3-docker-build, it is observed on lines 5 and 12 that building the image +twice and extracting the resulting binary produces different checksums. +Additionally, on lines 6 and 13, it is evident that the checksums of the images +are inevitably different. Consequently, this method is classified as +non-reproducible over time. + +#heading(outlined: false, level: 3, "Reproducibility In Space") + +This scenario was executed on various machines and architectures, resulting in +different binaries and images. Therefore, this method is classified as +non-reproducible in space as well. + +#heading( + outlined: false, + level: 3, + "Reproducibility Of The Build Environment", +) + +The reproducibility of build environments in Docker images, while generally +reliable in the short term, can face challenges over time. Docker images are +built on layers, often starting from base images provided by specific vendors. +These base images can receive updates that alter their contents, meaning a +`Dockerfile` that successfully built an image at one time might not produce an +identical image later due to changes in its base layers. Additionally, not +pinning specific versions of base images and external dependencies in the +`Dockerfile` can lead to inconsistencies, making the exact reproduction of a +Docker environment challenging if not managed carefully. Therefore, while Docker +simplifies the consistency of deployment environments, ensuring long-term exact +reproducibility requires careful management of image sources and dependencies. + +Docker is intrinsically designed to facilitate reproducible builds, with the +capability to generate identical containers across multiple executions. 
However, +the challenge to reproducibility arises not from Docker's fundamental features +but from the use of specific base images within Docker containers. A significant +illustration of this problem is shown in @ch3-docker-build, where rebuilding the +image results in different containers even though the base image version has +been pinned to a specific commit at lines 1 and 7. + +#info-box(kind: "info")[ + "Pinning" refers to the practice of specifying exact versions of software, + base images, or dependencies to use when building a Docker container. This + practice helps ensure that the build environment remains consistent and + predictable over time, despite updates or changes to those dependencies. + Pinning is crucial for maintaining consistency as it prevents the build + environment from changing unexpectedly due to updates in dependencies. It also + enhances reproducibility, allowing developers to recreate the same environment + at a later date, which is vital for debugging and development. Moreover, it + enhances reliability by reducing the likelihood of encountering unexpected + issues or conflicts caused by differing versions of dependencies. + + For example, specifying `FROM alpine:3.19.1` in a `Dockerfile` instead of + `FROM alpine` ensures that the Alpine 3.19.1 version is always used, providing + stability. This mechanism applies similarly across different programming + language ecosystems. However, it is important to note that version tags, like + `3.19.1`, can be replaced or updated by the maintainers, potentially altering + the contents associated with a #emph[pinned] version. + + To overcome this, the use of digests or checksums (@checksum) can anchor + images to a specific snapshot, offering a stronger guarantee of immutability. 
+ For instance, specifying + `FROM alpine@sha256:c5b1261d6d3e43071626931fc004f70149baeba2c8ec672bd4f27761f8e1ad6b` + as shown in @ch3-dockerfile ensures that exactly the same image is used + consistently, regardless of any updates. +] + +Docker's containerization technology offers a way to create consistent software +environments across various systems by encapsulating both the software and its +dependencies within containers. This encapsulation aids in ensuring a uniform +deployment process. However, the approach's reliance on base images and the +package managers they use brings forth challenges in maintaining +reproducibility. This is primarily because base images might not be strictly +version-controlled, and the package managers used within these images can result +in the installation of varying dependency versions over time. + +For example, traditional package managers like `apt` (used in Debian-based +#glspl("OS")) or `yum` (used in RedHat-based #glspl("OS")) do not +inherently guarantee the installation of the exact same version of a software +package across space and time. Typically, this variability stems from updates in +the package repositories, where an `apt-get install` command might fetch a newer +version of a library than was originally used. Such updates could potentially +introduce unexpected behaviour or incompatibilities. + +Docker and similar containerization technologies act as sophisticated +assemblers, piecing together the diverse components required to create a +container. This process, while streamlined and efficient, is not immune to the +introduction of variability at any stage of the assembly line. Whether due to +updates in base images, fluctuations in package versions, or differences in +underlying infrastructure, these variables can compromise the reproducibility of +the resulting container (@def-deterministic-build). 
Recognising this, it becomes +crucial for developers and researchers to approach container creation with a +keen awareness of these potential pitfalls. By meticulously managing base +images, employing reliable package managers, and adhering to best practices in +`Dockerfile` construction, one can mitigate the risks of variability and move +closer to achieving true reproducibility in containerised environments. + +== Evaluation 3 - Guix + +@guixwebsite is an advanced package manager, designed to provide reproducible, +user-controlled, and transparent package management. It leverages functional +programming concepts to ensure reproducibility and reliability, using the GNU +Guile #cite(, form:"normal") programming language for its core daemon, +package definitions and system configurations (@courtes2013functional). + +Central to Guix's philosophy is the concept of reproducible builds and +environments. This ensures that software can be built in a deterministic manner, +enabling exact replication of software environments at any point in space and +time. Guix achieves this by capturing all dependencies, including the toolchain +and libraries, in a way that they can be precisely recreated. It supports +transactional package upgrades and rollbacks, making system modifications +risk-free by allowing users to revert to previous states easily. + +Guix uses @guile, a Scheme #cite(, form:"normal") +implementation, allowing for more expressive and programmable package +definitions. This choice reflects Guix’s emphasis on customization and alignment +with the @fsfwebsite project's philosophy, rejecting proprietary blobs and +aiming for complete software freedom, which may limit hardware compatibility. +Guix’s approach can pose a high entry barrier due to its use of a +general-purpose functional programming language but offers extensive flexibility +for those familiar with Lisp-like languages. That said, users are free to extend +Guix with custom packages, free or not. 
+ +Guix is committed to ensuring reproducibility and reliability, based on the +functional deployment model first introduced by @Dolstra2006. It assures +reproducible builds by treating software environments as immutable entities, +thereby minimising variability across different systems. Guix's approach to +software building and package management, grounded in the principles of +functional programming and transactional package upgrades, places a strong +emphasis on reproducibility. However, this functional paradigm +(@def-functional-package-management) introduces a learning curve and +necessitates a shift from traditional imperative package management methods. +Additionally, the adoption of Guix might be further complicated by the +unavailability of non-free software, marking a significant consideration for teams +considering Guix. + +#figure( + { + sourcefile( + file: "guix.scm", + lang: "Lisp", + read("../../lib/scenario-3/guix.scm"), + ) + }, + caption: [From Scenario 3, the Guix build file (`guix.scm`)], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-3.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Building the C sourcecode from the Guix build file of Scenario 3], +) + +#heading(outlined: false, level: 3, "Reproducibility In Time") + +In @ch3-guix-build, we notice on lines 5 and 11 that the output hashes are the +same. This is therefore classified as reproducible in time. + +#heading(outlined: false, level: 3, "Reproducibility In Space") + +Building the program in a different environment with the same architecture +(`x86_64-linux`) resulted in identical output. Compiling the source code on +another architecture (`aarch64-darwin`) also produced consistent results, though +different from those obtained on `x86_64-linux`. Therefore, we can conclude that +the program is reproducible across different environments, #emph[modulo] the +hardware architecture. 
+ +#heading(outlined: false, level: 3, "Reproducibility Of The Build Environment") + +The reproducibility of the build environment is heavily controlled when using +Guix. The dependencies are locked and pinned, it is simply not possible to +create a different build environment. + +== Evaluation 4 - Nix + +@nix is a revolutionary package management system that dramatically reshapes the +landscape of software construction, consumption, deployment and management. Its +distinctive methodology, grounded in the principles introduced in @Dolstra2006, +marked its inception, setting a new standard for handling software packages. +Central to Nix's core is its use of the Nix language, a domain specific +Turing-complete language that facilitates the description of software packages, +their dependencies, and the environments in which they operate. + +#info-box(ref: "def-turing-complete")[ + The term "Turing-complete" is named after the British mathematician and + logician Alan Turing, who introduced the concept of a Turing machine as a + fundamental model of computation. A Turing-complete language is a programming + language that can simulate a Turing machine, a theoretical device that can + solve any computation that can be described algorithmically. Turing + completeness is a fundamental property of any programming language that can + perform any computation that a Turing machine can, given enough time and + memory. This property allows a language to express any algorithm or + computation, making it a powerful tool for software development. Examples of + Turing-complete languages include: Python, PHP, C++ and JavaScript. On the + other hand, non-Turing-complete languages, which are limited in their + computational capabilities, include: SQL, Regex and HTML. 
+] + +This language enables Nix to implement a functional deployment model, ensuring +reproducibility, reliability, and portability across different systems by +treating packages as functions of their inputs, which results in deterministic +builds. + +Nix emphasises a deterministic build environment, allowing developers to specify +and isolate dependencies explicitly. This method significantly mitigates +#emph["it works on my machine"] issues by providing a high degree of control over +the build environment. Nix's strength in ensuring reproducibility comes with the +need to embrace its unique approach to system configuration and package +management, representing a paradigm shift for new users. + +#info-box(kind: "conclusion")[ + Nix essentially modifies the #gls("POSIX", long: false) standard by installing + software in unique locations rather than following the shared file structure + described by the #gls("FHS"). This seemingly minor change brings about several + advantageous properties, such as software composition, immutability, + configuration rollback, caching and reproducibility. +] + +Nix provides two principal methodologies that are not mutually exclusive: the +legacy method (\u{00B1}2006) and the relatively newer #emph[Flake] +(\u{00B1}2020) approaches. + +=== Nix legacy method + +The legacy way of using Nix involves defining a `default.nix` file that is +similar to a function definition in the Nix programming language. This file +contains a set of inputs, specifies dependencies, the build command and its +output. By default, this method does not enable pure evaluation mode, meaning +the hermeticity of the build process is not guaranteed. As a result, potential +uncontrolled side effects may occur during the build process. For instance, as +demonstrated in @ch3-default-nix at line 2, we manually enforce a very specific +version of the `pkgs` variable, a specific snapshot of the Nix package +repository that fixes the versions of all packages and libraries. 
Similarly to +the process outlined in @ch3-docker-build-env for Docker, this approach, known +as "dependency pinning," ensures consistency and reproducibility in the build +environment. + +#figure( + { + set text(size: .85em) + sourcefile( + file: "default.nix", + lang: "nix", + read("../../lib/scenario-4/default.nix"), + ) + }, + caption: [The Nix build file (`default.nix`) from Scenario 4], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-4.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Building the C sourcecode with Nix in Scenario 4], +) + +=== Nix Flake + +Nix #emph[Flake] introduces a structured approach to managing Nix projects, +focusing on reproducibility and ease of use. Currently in an experimental phase, +Flake is anticipated to transition to a stable feature soon due to increasing +community endorsement (@ch3-flake-vs-legacy) and the tangible reproducibility +advantages it offers. + +#figure( + image("../../resources/images/flake-vs-legacy.jpg"), + caption: [On the left, new repositories containing a `flake.nix` file, and on + the right, containing a `default.nix` file + (#link("https://x.com/DeterminateSys/status/1794394407266910626")[Determinate System]) + ], +) + +Flakes aim to simplify and enhance the Nix experience by providing an immutable, +version-controlled way to manage packages, resulting in significant improvements +in reproducibility and build isolation. Flakes manage project dependencies +through a single, top-level `flake.lock` file, which is automatically generated +to precisely pin the versions of all dependencies, including transitive ones, as +specified in the `flake.nix` file. This file ensures project consistency and +reproducibility across different environments. + +In addition to altering the Nix command-line syntax, Flakes enforce a specific +structure and entry point for Nix expressions, standardising project setup and +evaluation. 
They enable pure evaluation mode by default, which enhances the +purity and isolation of evaluations, making builds more consistent and reducing +side effects. For instance, making external requests during a build is not +possible with Flakes, ensuring that every dependency must be explicitly +declared. Flakes require changes to be tracked through `git`, enabling the exact +revision of the project to be pinned in the `flake.lock` file. + +The files `flake.nix` and `flake.lock` are complementary and central to the +locking mechanism that ensures reproducibility. Together, when committed in a +project, they guarantee that every user of a Flake, regardless of when they +build or deploy the project, will use the exact same versions of dependencies, +thereby ensuring that the project is built consistently every time. However, it +is possible to have only a `flake.nix` file without a `flake.lock` file. In +such cases, having a reproducible build environment is not guaranteed since +dependencies could drift to newer versions. + +#figure( + { + sourcefile( + file: "flake.nix", + lang: "nix", + read("../../lib/scenario-5/flake.nix"), + ) + }, + caption: [The Nix Flake file (`flake.nix`) from Scenario 5], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-5.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Building the C sourcecode with Nix flake in Scenario 5], +) + +#heading(outlined: false, level: 3, "Reproducibility In Time") + +In @ch3-default-nix-build, we notice on lines 5 and 11 that building the +sourcecode twice using Nix's legacy method produces the same output. In +@ch3-nix-flake-build, on lines 4 and 9, we notice the same thing. +This is therefore classified as reproducible in time. + +#heading(outlined: false, level: 3, "Reproducibility In Space") + +Just like Guix, building the program in a different environment with the same +architecture (`x86_64-linux`) resulted in identical output. 
Compiling the source +code on another architecture (`aarch64-darwin`) also produced consistent +results, though different from those obtained on `x86_64-linux`. Therefore, we +can conclude that the program is reproducible across different environments, +#emph[modulo] the hardware architecture. + +#heading(outlined: false, level: 3, "Reproducibility Of The Build Environment") + +The reproducibility of the build environment is heavily controlled. The +dependencies are locked and pinned; it is simply not possible to +create a different build environment. + +=== Dealing With Variability + +This section will focus on how Nix deals with unstable outputs, highlighting how +it abstracts this issue behind the scenes. The scenarios that will be +used are: + +- Scenario 6: Building an #gls("OCI") image with Nix +- Scenario 7: Compiling a Typst document to a PDF file +- Scenario 8: Compiling a Typst document to a PDF file with Nix, showing how Nix + abstracts the issue of non-deterministic builds. +- Scenario 9: Compiling a Typst document with Nix, fixing the issue of + non-deterministic builds. + +#info-box[ + Typst #cite(, form: "normal") is an advanced markup-based typesetting + language that compiles to #gls("PDF") or #gls("SVG"). It was initiated in 2019 + at the Technical University of Berlin by Laurenz Mädje and Martin Haug. + Developed in Rust, this programmable markup language for typesetting became + the subject of their master's theses, which they wrote in 2022. After several + years of closed-source development, Typst was open-sourced and released to the + public in 2023. Despite being relatively recent and lacking a stable version, + Typst's maturity has allowed it to be used for writing this master's thesis. +] + +Building #gls("OCI") images using Docker is a common use case in the software +development process. However, the output of the build can be non-deterministic +due to the nature of the build process. 
In scenario 6, we will build an +#gls("OCI") image using Nix only. + +#figure( + { + sourcefile( + file: "flake.nix", + lang: "nix", + read("../../lib/scenario-6/flake.nix"), + ) + }, + caption: [ + The Nix Flake file (`flake.nix`) to build an OCI image in Scenario 6 + ], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-6.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Building an #gls("OCI") image with Nix], +) + +In @ch3-nix-flake-container-build, lines 5 and 11, we notice that building an +#gls("OCI") image twice using Nix produces the same output. The Flake file in +@ch3-flake-nix-container shows that it is possible to create reproducible +#gls("OCI") containers with Nix, in a simple and declarative way. + +In scenario 7, we will compile a trivial Typst document. + +Consider the following Typst document on the left, and its rendering on the +right: + +#grid( + columns: 2, + rows: 1, + column-gutter: 1em, + align: bottom, + figure( + { + sourcefile( + file: "hello-world.typst", + lang: "typst", + read("../../lib/scenario-7/src/hello-world.typst"), + ) + }, + caption: [Typst document], + ), + [ + #figure( + box(stroke: .6pt + luma(200), radius: 3pt)[ + #image("../../resources/images/hello-world.svg") + ], + caption: [Rendering of the Typst document], + ) ], +) + +@ch3-hello-world-typst-build-log shows that manually compiling the same document +twice yields different resulting files. + +#figure( + { + shell(read("../../resources/sourcecode/scenario-7.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Manually compiling a Typst document to a #gls("PDF") document in Scenario 7], +) + +While viewing the resulting #gls("PDF") files side by side, we notice that they +appear totally identical to @typst-hello-world-rendered. However, the checksums +of those files are different. 
This discrepancy is common, where the same input +can produce different outputs due to non-deterministic behaviour in the build +process. Even if the resulting outputs are identical, there can be internal +differences. Therefore, given an arbitrary build output, it is impossible to +determine if a build is valid or not. It is important to acknowledge that tools +like Guix or Nix address this issue by ensuring that the build environment only +is consistent and reproducible. In @ch3-nix-typst-flake, we will show how to +compile the same Typst document using Nix and how to eventually fix the +discrepancy. + +#figure( + { + sourcefile( + file: "flake.nix", + lang: "nix", + read("../../lib/scenario-8/flake.nix"), + ) + }, + caption: [ + The Nix `flake.nix` file to build a Typst document to a PDF in Scenario 8 + ], +) + +Compile it twice and observe the outcome: + +#figure( + { + shell(read("../../resources/sourcecode/scenario-8.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Building a Typst document in Scenario 8], +) + +At lines 4 and 7 of @ch3-hello-world-typst-build, we notice that compiling +twice a Typst document with Nix produces two different #gls("PDF") files, their +respective checksums are different. While the visual output appears identical, +the underlying files are not. At line 3 of @ch3-hellow-world-typst-rebuild, we +leverage a command with specific flags to verify if a build output is +reproducible. + +#figure( + { + shell(read("../../resources/sourcecode/scenario-8-rebuild.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Checking if a build output is reproducible], +) + +Nix will build the document once (line 2), then a second time (line 3) and then +compare the output hashes. 
Thanks to the `--keep-failed` argument, we inform Nix +to keep the failed builds so we can do a more introspective analysis of the +issue and try to find the root cause of the discrepancy, for example, using +`diffoscope` #cite(, form: "normal") in +@ch3-hello-world-typst-rebuild-diffoscope. + +#figure( + { + shell(read("../../resources/sourcecode/scenario-8-diffoscope.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Checking discrepancies between two builds using `diffoscope`], +) + +#figure( + image("../../resources/images/diffoscope-typst.svg"), + caption: [ + A visual comparison with `diffoscope` of two #gls("PDF") files generated + from the same Typst document + ], +) + +`diffoscope` visually compares the discrepancy between the two #gls("PDF") +files. From the report in @ch3-nix-typst-diff, the highlighted difference seems +to be the creation date metadata. Doing a quick search on @typstdoc confirms +that Typst is able to change the creation date of the output file. +@ch3-nix-typst-flake-fixed implements the trivial change at line 1: + +#figure( + { + sourcefile( + file: "hello-world.typst", + lang: "typst", + read("../../lib/scenario-9/src/hello-world.typst"), + ) + }, + caption: [On line 1, the Typst document date is now set to `none`], +) + +#figure( + { + shell(read("../../resources/sourcecode/scenario-9-rebuild.log")) + }, + supplement: "Terminal session", + kind: "terminal", + caption: [Checking if compiled Typst document is reproducible in Scenario 9], +) + +Now we notice that running the command to check if the output is reproducible +returns nothing, meaning that the output is fully reproducible. + +#info-box[ + Often, raising an issue with the upstream project is the most effective method + for informing the authors about a problem and monitoring its resolution. 
In + the case of Typst, an issue + #cite(form: "normal", ) was documented to + describe the problem, and in less than two weeks, it had been addressed and + resolved. Consequently, the discrepancy in @ch3-hello-world-typst-build is no + longer applicable for Typst versions newer than `0.11.0`. +] + +== Conclusion + +In this concluding section of the chapter, a summary of the reproducibility +assessment can be found in @ch3-table-conclusion. Following the table, this +section provides a detailed explanation of our categorization process, outlining +the specific criteria used for classifying. Each classification is justified +based on the results obtained from our comprehensive empirical evaluation +process. + +#figure( + include "../../resources/typst/ch3-table-conclusion.typ", + caption: [Software evaluation comparison], + kind: "table", + supplement: [Table], +) + +In evaluating the reproducibility of various tools and methodologies within, a +particular focus has been set on the bare compilation method (@ch3-tool1). This +approach, characterised by its reliance on the host operating system's installed +software for compiling source code into executable programs, presents a nuanced +challenge to reproducibility. Theoretically, bare compilation allows for a +straightforward reproduction of computational results, assuming a static and +uniform environment across different computational setups. However, the +practical application of this method exposes inherent vulnerabilities to +environmental variability. The reliance on the host's installed software means +that the exact version of compilers, libraries, and other dependencies can +significantly impact the outcome of the compilation process. These elements are +seldom identical across different systems or even over time on the same system, +given updates and changes to the software environment. 
Consequently, the +reproducibility promised by Bare compilation is compromised by these external +variables, which are often not documented with sufficient rigor or are outside +the user's control. Acknowledging these challenges, we categorise the bare +compilation (@ch3-tool1) as non-reproducible by default, reflecting a practical +assessment rather than a theoretical limitation. The classification underscores +the significant effort required to document and manage the dependencies on the +host's software to achieve a reproducible build process. This perspective is +supported by the literature #cite(, form: "normal"), which advocates +for standardising and simplifying the management of computational research +artefacts. The classification of the method 1 (@ch3-tool1) as *non-reproducible* +is a pragmatic acknowledgment of the difficulties presented by the dependency on +the computational environment. + +Docker and similar containerization technologies (@ch3-tool2) can facilitate +reproducible environments. The reason is that while they provide a high degree +of isolation from the host system, they are still subject to variability due to +the base images and package managers used within the containers. This +variability, however, can be effectively managed with low effort. By +meticulously selecting and managing base images and dependencies, it is indeed +feasible to elevate Docker from partially to fully reproducible. For these +reasons, they are categorised as *partially reproducible*. + +Nix (@ch3-tool3) and Guix (@ch3-tool4) provide a high level of control over +the build environment and dependencies, facilitating deterministic and +reproducible builds across different systems. By capturing all dependencies and +environment specifics in a declarative manner, Nix and Guix offer a reliable and +transparent approach to software development. 
The functional deployment model +implemented by Guix, Nix and their forks (like @lix), along with their +transactional package upgrades and rollbacks, further enhances reproducibility +by enabling exact replication of software environments within the same +architecture at any point in space and time. Under the hood, they introduce a +novel approach to addressing the challenges of reproducibility. By using a very +specific storage model, they ensure that the resulting output directory is +determined by the hash of all inputs. This model, while not guaranteeing bitwise +identical binaries across all scenarios, especially across different hardware +architectures, ensures that the process and environment for building the +software are reproducible. Nix and Guix's model represents a significant step +forward in mitigating reproducibility challenges within #gls("SE"). By ensuring +that every build can be traced back to its exact dependencies and build +environment, it enhances the reliability of software deployments. This approach +is particularly beneficial in #gls("CICD") pipelines, where consistency and +reliability are paramount. Achieving reproducibility in #gls("SE") is filled +with challenges, from architecture dependencies to non-determinism in compilers. +These tools offer a compelling solution by ensuring reproducible build +environments. The exploration of the concepts used in Guix and Nix, and their +methodologies provides valuable insights into the complexities of software +reproducibility and the necessity for continued research and development in this +field. They both are categorised as *reproducible*.
diff --git a/src/thesis/4-conclusion.typ b/src/thesis/4-conclusion.typ new file mode 100644 index 0000000..5bf93d4 --- /dev/null +++ b/src/thesis/4-conclusion.typ @@ -0,0 +1,672 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * +#import "theme/common/titlepage.typ": * +#import "theme/common/metadata.typ": * +#import "theme/disclaimer.typ": * +#import "theme/leftblank.typ": * +#import "theme/acknowledgement.typ": * +#import "theme/abstract.typ": * +#import "theme/glossary.typ": * +#import "theme/infos.typ": * + +#chapterquote( + title: "Conclusion", + ref: "chapter4", + quoteAttribution: , + quoteText: [ + Reproducibility has the potential to serve as a minimum standard for judging + scientific claims when full independent replication of a study is not + possible + ], +) + +== Summary + +This thesis embarked on an in-depth exploration of reproducibility in +#gls("SE"), motivated by the growing necessity for reliable and repeatable +results in research and software development. In this chapter, we summarise our +findings and discuss the implications of our research. We also suggest future +work that could be done to improve reproducibility. + +In the introductory @chapter1, I provided an overview of my personal journey and +the experiences that have led me to pursue this specific area within #gls("SE"). +This chapter offered readers contextual information about my background, +especially highlighting my active involvement in open-source communities and my +dedicated advocacy for the adoption of reproducibility tools and practices. I +delved into my motivations for choosing this topic, underscoring the critical +importance of reproducibility in #gls("SE"). The relevance of establishing +reliable and repeatable processes to enhance the integrity of software products +and to foster a culture of transparency and collaboration. To guide the reader +through the thesis, I recall the goals and a structured overview of the +document, chapter by chapter. 
+ +In @chapter2, the theoretical foundations of reproducibility were explored, +tracing its origins from classical scientific disciplines where it has long been +a cornerstone for validating experimental findings and theories. The chapter +began with a historical overview, highlighting how reproducibility emerged as a +fundamental principle in the natural sciences and the shifts it has undergone +with the advent of the digital era. Following this introduction, a comprehensive +set of concepts central to understanding reproducibility in #gls("SE") was +presented. Formal definitions for key terms were introduced, establishing a +rigorous and foundational basis for subsequent developments. The narrative then +extended into the realms of open source and software security, pivotal areas +where reproducibility intersects with broader concerns. Open source software, +with its ethos of transparency and collaboration, enhances reproducibility by +making the source code readily available. This transparency facilitates the +verification of software builds and improves security. With the source code +accessible and inputs correctly declared, it allows for immediate identification +of dependencies and quicker identification of vulnerabilities. The focus shifted +to the specific challenges that software engineers and researchers face in +achieving reproducible builds. This discussion delved into obstacles such as +non-deterministic build processes, variability of environments, and the lack of, +to some extent, standard practices for documenting and sharing the necessary +details to replicate software builds. By examining these challenges, the +groundwork was laid for identifying effective strategies to address them. +Lastly, it introduced third-party tools designed to compare build outputs. + +In @chapter3, I systematically explored and assessed different tools and +strategies used to build software with an emphasis on their potential to +facilitate reproducible results. 
The chapter presented a comprehensive analysis +of four distinct strategies: Bare compilation, Docker, Guix, and Nix. Each +strategy was evaluated through multiple objective criteria. This approach allows +for a balanced assessment of each method, providing insights into how well they +help in achieving reproducible builds. For Bare compilation, the focus was on +the traditional approach of building software directly on the system without +containerization or virtual environments, highlighting its limitations and +potential issues in reproducibility. Next, Docker was evaluated as a popular +containerization technology that encapsulates the software environment, aiming +to enhance reproducibility by isolating dependencies. Following Docker, I +examined Guix and Nix, two functional package managers that offer more granular +control over environment configurations. The chapter concluded with a +comparative analysis that ranked these strategies according to their +reproducibility potential. This evaluation serves as a resource for developers +making decisions about which tools and strategies to implement in their projects +to enhance reproducibility. + +In the remaining sections of this conclusion chapter, we will summarise the key +findings and the broader implications of my research. Additionally, I will +expand the discussion by exploring other facets of reproducibility. Finally, I +propose future research directions aimed at enhancing reproducibility. + +== Evaluation Of Tools + +@chapter3 was dedicated to evaluating different tools and strategies for +building software with an emphasis on their potential to facilitate reproducible +results. + +Bare compilation is the traditional approach to building software directly on +the system without containerization or virtual environments. This method offers +minimal isolation and control over the build environment, making it challenging +to achieve reproducibility. 
However, it is still widely used in practice due to +its simplicity and familiarity, usually with the underlying operating system +ecosystem. Debian based Linux distributions are a common choice for this method +due to their extensive package repositories and long-term support, therefore, +the usage of the package manager `apt` became a standard practice for installing +dependencies when one is missing. This imperative method for installing missing +dependencies can lead to non-reproducible builds due to the lack of version +pinning. The `apt` package manager does not provide snapshots where all the +dependencies are frozen at a given state, which makes it challenging to +reproduce the exact build environment. However, it has the advantage of being +popular and widely supported. + +Using Docker for building software is a popular choice due to its increasing +popularity and the ease of creating shareable containers through Docker Hub +#cite(, form: "normal"). However, sharing a container as a single +#gls("OCI") file requires a bit more work and is not as straightforward as +sharing a Docker image on their dedicated platform Docker Hub. In a way, thanks +to Docker, users have been introduced to the concept of containerization, +immutability, and to some extent, reproducibility. + +While discussing Docker with people, I noticed a common misconception about +reproducibility that is worth noting. To illustrate this, let's consider a +project shipping builds of their open-source software through Docker images. At +each release, they publish a new version of their image. These Docker images are +immutable, and users can use and reuse them at will. However, it is simply +impossible to reproduce those images themselves from the sources. While this +illustrates the Docker leitmotif #emph["build once, use everywhere"], it does +not demonstrate true reproducibility. 
The essence of reproducibility lies in our +ability to recreate identical copies of these images from the sources on any +machine. If something can be reproduced multiple times but yields different +results each time, it is not truly reproducible. Similarly, if something is +produced only once and is not meant to be reproduced, it is, to some extent, +also not reproducible. + +#info-box[ + In #emph[declarative configuration management] + #cite(,form:"normal"), tools such as Docker, Kubernetes + #cite(,form: "normal"), and Terraform + #cite(,form: "normal") are used to specify the desired end state + of the system rather than the steps to achieve it. For example, a `Dockerfile` + describes the final environment for a container, ensuring that the system + matches predefined specifications. This method aligns with the #emph[congruent + system management] #cite(,form:"normal") approach, focusing on + consistency and predictability. Declarative configurations ensure idempotence, + meaning the same configuration can be applied multiple times without altering + the system beyond its intended state. This abstraction makes it easier to + understand and maintain, as the system determines the necessary actions to + achieve the desired state. + + In contrast, #emph[imperative configuration management] + #cite(,form:"normal") involves detailing the exact steps required to + transition a system from its current state to a desired state, providing + granular control over the configuration process. Tools such as Ansible + #cite(, form: "normal"), Chef #cite(, form: "normal"), Puppet + #cite(, form: "normal"), and Bash scripts exemplify this approach. + This method aligns with the #emph[convergent system management] + #cite(,form:"normal") approach, focusing on achieving a goal + through a series of specific actions.
While imperative configurations allow + for complex logic and conditional operations, they can be challenging to + maintain due to their non-idempotent nature, meaning the same script can + produce different results depending on the system's initial state. This + approach requires meticulous management to ensure consistency and + repeatability, offering detailed control at the expense of simplicity and + predictability. + + The expressiveness of imperative tools comes at a significant cost. These + tools allow developers to make stronger assumptions about the current state of + the system. This creates a strong likelihood of like environments diverging + over time in a process known as #emph[configuration drift]. The declarative + approach to configuration management reduces the possibility of configuration + drift by favoring idempotence, explicit dependency graphs, and maintaining a + strong awareness of the current state of the environment + #cite(,form:"normal", supplement: [p.348]). +] + +During the evaluation, I found Docker to be very easy to use. By using a simple +declarative syntax to define the build steps, the `Dockerfile` is one step +forward into making configurations more explicit and this has contributed a lot +to the success of the Docker ecosystem. On a less positive note, using Docker on +other platforms than Linux can be challenging and a deal breaker for some users. +On non-Linux systems, Docker relies on a virtual machine to create a Linux +environment, which can lead to performance overheads and latency issues. The +differences in filesystems between Linux and non-Linux platforms can also result +in inconsistencies and unexpected behaviour in container operations. +Additionally, networking configurations and capabilities can vary significantly, +causing more complex setups and potential connectivity issues. Resource +allocation and management can be less efficient on non-Linux platforms due to +the intermediary VM layer. 
On Linux based architectures, the performance is not +as good as running the software natively, but it is still acceptable for most +use cases. However, while initiatives such as DevContainer +#cite(, form: "normal") are trying to provide a more integrated +experience with Visual Studio Code #cite(, form: "normal"), working with +and inside a container adds an extra layer of complexity that can be challenging +to manage, especially when dealing with networking, storage and security. + +Guix has been an interesting tool to evaluate. While the learning curve is +steeper than Docker, the benefits are significant. I appreciated the strict and +declarative approach to package management, which aligns well with the +reproducibility goals. The idea of using an existing general purpose language +for declaring packages and configurations is a powerful idea. The community is +small but active, however since no proprietary tools are packaged, it can be a +challenge for users to find the software they need. There are workarounds, +but they are not advertised by the Guix community, which tends to focus on and +adhere to the free software philosophy #cite(, form: "normal"). The +performance of Guix is great, since no containerization is involved, the +software runs natively on the system and accessing storage and network is a +breeze. Guix extensively uses `git` #cite(, form: "normal") for fetching +packages and configurations, and the information displayed to the user while +running it is very clean and clear. + +Nix has been the most interesting approach to evaluate, technically but also +politically. The learning curve is steep, but the benefits are significant. +The Nix language while being Turing-complete (@def-turing-complete) has a very +specific and limited #gls("DSL"), it is remarkably clean and powerful, making it +highly suitable for managing package builds and configurations.
Additionally, +the simplicity of the Nix language enhances its efficiency and usability, +positioning it, in my own opinion, as one of the best languages for this task. + +During the making of this thesis, I contributed to Nix and I really appreciated +how easy it is to contribute but also the transparency of the process when it +comes to making a change. The Nix community is very large, active and welcoming, +and the Nix package repository `nixpkgs` is one of the most active repositories +on GitHub #cite(, form: "normal"). Nix has completely changed the +way I think about software management, how I consume software and how I ship +software. I wish I had discovered it earlier. + +While Nix offers many advantages, it also has some drawbacks, primarily +concerning its installer. The Nix installer is a shell script that downloads and +installs Nix on the system. However, there are alternative installers available. +During my evaluation, I explored these different installers and ultimately chose +one developed by a United States-based company founded by Eelco Dolstra, the +creator of Nix. Given this connection, I felt confident using their installer. +Nevertheless, I found it peculiar to maintain multiple installers for the same +software, which I suspect might be a source of confusion for many users, both +new and experienced. Could this indicate a deeper governance problem? + +Another drawback of Nix is the documentation. While the Nix manual is extensive +and well-written, it can be overwhelming for new users. The manual is +comprehensive but it lacks a clear and concise structure, examples and a lot of +topics are not covered. A lot of energy, initiative and effort are being invested +and things are slowly moving in the right direction. + +While working with Nix for the first time, I got introduced to the concept of Nix +channels. Nix channels are a mechanism used to distribute and update collections +of Nix expressions.
These channels provide users with a way to receive updates +for Nix packages and configurations. While Nix extensively uses `git`, the +unique concept of channels adds an extra cognitive load for users who want to +simply upgrade their machines. I preferred the simplicity Guix offers by just +using `git` to update the system. When using Flakes, the concept of Nix channels +is no longer needed. + +Flakes is an experimental feature as of writing. Released in November 2021 with +Nix 2.4 #cite(, form: "normal"), Flakes is powerful yet +controversial. For example, some companies, including one founded by the creator +of Nix, are advocating for the adoption of Flakes, while the Nix community +awaits its stabilization. Although these differences do not affect Nix's +functionality in the short term, they can be confusing for new users and may +lead to fragmentation within the community. One might ask why such companies are +not contributing to the same codebase as the rest of the community, a situation +that has already led to some division. + +#info-box(kind: "important")[ + There are currently around 230 committers spread across the globe taking care + of the Nix package repository on Github + #cite(,form:"normal"). On June 1st 2023, I've been granted + the status of project committer #cite(,form:"normal"). + This status allows me to merge commits, review code, and contribute to the Nix + ecosystem in a more direct way. However, I want to clarify that the + conclusions of this thesis were not influenced by my role in the Nix project. + I have worked to maintain the highest level of objectivity in this document, + with only reproducibility as the primary focus. +] + +At the time of writing, the Nix community was facing a significant crisis, +leading to the creation of two new forks of Nix, @lix and @aux. This +fragmentation is a major concern for the project's future. 
While a new +governance structure is being established, the community remains divided over +different installers, the experimental Flake feature, the sponsorships policy +and now forks. Although the current situation is not ideal, I am confident that +the community will overcome this crisis and continue to provide an excellent +tool for the #gls("SE") community. The controversial Flake feature has attracted +many new users to the Nix ecosystem, and the community is growing rapidly. These +issues are likely a result of this rapid growth and the initial lack of clear +and transparent governance, a problem that needs to be resolved. + +To conclude, it is essential to recognise that Nix is the result of extensive +research and development, used in production by numerous companies and +individuals. The Nix community is dynamic and vibrant, promising a bright future +for the project. The core value of Nix lies in its package repository, +`nixpkgs`, which hosts thousands of packages readily accessible to any Nix user. +Regardless of which Nix variant or fork one chooses, the true asset remains that +package recipe repository, likely to be shared across different forks, ensuring +consistency and reliability in package management. + +#info-box(kind: "conclusion")[ + I have concluded that the ideas implemented by Nix stand out as the optimal + choice. With two decades of maturity and robustness, the Nix ecosystem is, in + my view, currently the best concept for implementing reproducibility in + #gls("SE"). I am convinced that Nix, or a similar technology that embraces the + same principles (e.g., @guixwebsite, @lix, @aux), has the potential to + revolutionise the way software is built, used, audited, deployed and shared. 
+] + +#pagebreak(weak: true) + +== Research Findings + +During this research, I have discovered in @chapter3 that the reproducibility of +software builds is a multifaceted challenge that requires a combination of +skills, tools and strategies to address it effectively. In this section, I +present a summary of the key findings and some other facets of reproducibility +that I briefly explored, but could be expanded in future research. + +=== At A Glance + +Reproducibility in #gls("SE") refers to the ability to consistently recreate the +same software product, with identical functionality, using the same methods and +data across different environments and over time. This involves +ensuring that the software build process, dependencies, and computational +environments are well-documented and standardised. + +#figure( + include "../../resources/typst/ch4-table-conclusion.typ", + caption: [Pros and cons of reproducibility], + kind: "table", + supplement: [Table], +) + +=== Limitations + +Each of the tools evaluated in @chapter3 offers unique advantages and features, +yet they also possess limitations that can impact their effectiveness in +achieving reproducibility. Below are some key points noted during the +evaluation. + +==== Cross Architecture Reproducibility + +Achieving software build reproducibility across different operating systems and +architectures is not feasible for certain types of build outputs, typically +binaries (@ch2-r13y-utopia). Binary build outputs depend on the #gls("CPU") +architecture (e.g., `x86`, `ARM`) because converting source code into machine +code uses a set of #gls("CPU") instructions directly inherited from the +#gls("CPU") architecture. Therefore, obtaining binaries that are identical +across every architecture is unlikely to occur. This inherent dependency means +that the binaries produced on different architectures will have variations, +making exact cross hardware architecture reproducibility unattainable. 
+ +==== Unavailability Of Upstream Packages + +To illustrate this limitation, I will use the example of a software package that +has been removed from the upstream repository and is no longer available for +download. In this scenario, the software package cannot be built, as the source +code is no longer accessible. Furthermore, any other packages that depend on +this now-unavailable package also become impossible to build. This limitation +highlights the importance of maintaining a robust and reliable infrastructure +for software repositories to ensure the longevity of software packages and +facilitate reproducibility. In the absence of such infrastructure, +reproducibility becomes increasingly challenging, as software packages may +become obsolete or unavailable over time. + +To circumvent this limitation, researchers and developers can adopt proactive +measures to ensure the reproducibility of their software builds. One approach is +to archive the source code and dependencies of the software package, preserving +them in a secure and accessible repository. This is what projects like Software +Heritage #cite(, form: "normal") are trying to achieve. By archiving the +source code and dependencies, researchers and developers can safeguard against +the loss of critical software components and maintain the reproducibility of +their builds over time. Additionally, implementing a caching layer to store +build outputs can significantly enhance reproducibility. This allows users to +retrieve precompiled build outputs, thereby avoiding the need to compile the +source code on their machines if the corresponding cached build exists. +Nix facilitates the creation of such cached build layers due to its principles +(@def-functional-package-management), as it produces immutable directories based +on sources. This means that modifying existing cached builds is not possible, +mitigating potential security issues related to accidental modifications.
It's +worth noting that this level of immutability and reproducibility is not the case +with all package managers. + +==== Standardisation + +Another limitation is the lack of standard practices for documenting and sharing +the necessary details to replicate software builds. This limitation underscores +the need for clear and comprehensive documentation to facilitate +reproducibility. An exemplary initiative addressing this challenge is the +#gls("PURL") project, which seeks to standardise the identification and location +of software packages across various ecosystems and tools. The #gls("PURL") +specification #cite(, form: "normal") provides a universal syntax to +reliably reference software packages, thereby mitigating the inconsistencies +that arise from diverse package management conventions. By establishing a common +framework, #gls("PURL") enhances the interoperability and reproducibility of +software builds across different platforms and tools, illustrating the critical +role of standardisation in reproducible research and development. In addition to +#gls("PURL"), the #gls("SWHID") is another significant development aimed at +improving standardisation. The #gls("SWHID") provides a unique, persistent +identifier for software source code, facilitating the precise identification and +retrieval of specific software versions from the @SWHArchive. + +To illustrate this, on 11 June 2024, GitHub announced +#cite(,form:"normal") that generated #gls("SBOM") files will +now include a #gls("PURL"). By including the #gls("PURL"), GitHub improves the completeness of the #gls("SBOM") data, helping users identify +the packages in their repositories more clearly.
This new GitHub feature exemplifies the +practical benefits of adopting a standardised specification, as it addresses a +critical need in reproducibility by providing the precise identification of +software components used in a build, thereby improving transparency and +reproducibility in software development through the inclusion of #glspl("PURL") +in GitHub's #glspl("SBOM"). + +== Future Work + +The exploration of reproducibility in #gls("SE") is an ongoing endeavour. As +technology advances and new tools emerge, the landscape of reproducibility +continually evolves. + +A foundational step towards enhancing reproducibility in #gls("SE"), and by +extension, in the broader realm of science, is to raise awareness of its +importance from the outset. This can be achieved through educational +initiatives, workshops, and seminars that highlight the benefits of +reproducibility and provide researchers with the necessary tools and resources. +Embedding reproducible practices into the research culture from the beginning +will help establish these practices as standard requirements rather than +optional enhancements. + +On the technical side, the frameworks and tools evaluated in this thesis provide +a robust foundation for reproducible software builds. However, significant scope +for improvement remains. Currently, various Linux distributions continue to +develop their own package managers, resulting in redundant efforts and +inefficient use of resources. While this idea seems utopian, adopting Nix as a +#emph[universal package manager] could be a potential solution. Nix abstracts +away the underlying system, making it an ideal candidate for standardising +software deployment across different Linux distributions. By providing a +consistent environment, Nix could streamline the deployment process, reduce +inconsistencies, enhance reproducibility across diverse systems, and improve +security.
With a universal package manager, security vulnerabilities could be +addressed more efficiently, as fixes could be deployed across all systems +simultaneously. However, implementing Nix universally presents several +challenges, such as ensuring compatibility with all distributions, overcoming +resistance from communities accustomed to their current systems, and managing +the scalability and maintenance of such an initiative. To address these +challenges, a phased approach could be adopted, starting with specific use cases +or distributions where Nix has demonstrated clear benefits. Additionally, +collaborative efforts and open dialogue among stakeholders could facilitate a +smoother transition. Furthermore, adopting Nix could significantly reduce our +carbon footprint by eliminating the need to store prebuilt binaries for +different distributions. Instead, binaries would be prebuilt once, then stored +and made available on a #gls("CDN") for all the Linux distributions, thus +streamlining the deployment process and contributing to environmental +sustainability. + +#info-box[ + The complexity of reproducibility is comparable to that of replicating a + painting. While explaining this thesis to a painter, I used the example of + creating an indistinguishable copy of another painting. The painter explained + that the likelihood of achieving such perfect replication is comparable to the + chance of a monkey writing Shakespeare's work due to the numerous variables + involved, such as the type of paint, the brush, the canvas, the lighting, the + environment, the painter's mood, and the time of day. This carefully chosen + analogy underscores the multifaceted nature of reproducibility in #gls("SE"), + where numerous variables and their intricate interplay influence the final outcome. +] + +The challenges in achieving reproducibility in artistic works highlight the +complexity and necessity of considering various factors.
This broader context +emphasises that reproducibility is not limited to #gls("SE") but is a universal +issue that requires ongoing attention and innovation. Across different fields +such as the arts, social sciences, and natural sciences, achieving +reproducibility involves addressing a wide array of considerations. +Specifically, these considerations include ethical, economic, philosophical and +educational aspects. + +=== Flaky tests + +Flaky tests are tests that exhibit inconsistent outcomes without changes to the +code being tested. This means that they can fail or pass sporadically, leading +to uncertainty and mistrust in the test outcomes. These unreliable tests are not +only problematic for developers but also hinder the effectiveness of valuable +techniques in software testing research. Essentially, flaky tests pose a threat +to the validity of methodologies that rely on the assumption that a test's +outcome is solely determined by the source code it evaluates. From a recent +paper #cite(,form:"normal"), a survey of software developers +found that 59% claimed to deal with flaky tests on a monthly, weekly, or daily +basis. + +Reproducibility is directly linked to the issue of flaky tests because their +inconsistency directly impacts the ability to reproduce results reliably. For +effective reproducibility in scientific software and other domains, it is +crucial that tests yield consistent and predictable results. Unstable tests +hinder the verification process, making it difficult to assert whether observed +issues are due to actual code defects or just the flakiness of the tests +themselves. This discrepancy affects validation, verification, and the +confidence in computational results. + +Future work on this issue should focus on developing techniques to identify and +mitigate flaky tests, ensuring that test outcomes are consistent and reliable. 
+Research could explore advanced methods for detecting flakiness, such as machine +learning algorithms that analyze test behavior patterns. Additionally, creating +tools to automatically stabilize flaky tests and integrating these solutions +into continuous integration pipelines would significantly enhance the +reliability and trustworthiness of software testing processes. + +=== Formal Concepts + +There is potential for further describing and refining the formal definitions +related to reproducibility introduced mostly in @chapter2. While this thesis has +introduced formal definitions of key terms related to reproducibility, these +definitions can be expanded, refined and improved. + +Having a set of formal definitions related to reproducibility is important +because they provide clarity and consistency in terminology, helping researchers +communicate more effectively. Standardised definitions allow for consistent +evaluation criteria, making it easier to compare results across different +studies and ensuring reliable assessments. This standardisation also supports +the development of tools and methodologies for verifying reproducibility, making +the evaluation process more rigorous. + +Moreover, formal definitions play an educational role by instilling a culture of +reproducibility among new researchers and students. They guide policy-making and +governance in research institutions, promoting best practices. Ultimately, these +definitions ensure the reliability and integrity of scientific findings. +Enhancing these formal definitions will further strengthen the quality and +credibility of research. + +=== Ethical Considerations + +Reproducibility is fundamentally intertwined with ethical practices in research, +as it bolsters the scientific process by enforcing transparency among +researchers. 
The commitment to making research reproducible serves the +scientific community and fosters public trust in scientific outcomes, +demonstrating a respect for the integrity of science and its impact on society. + +Ethically, researchers are obliged to report their findings and also provide +comprehensive details of their methodologies. This level of accountability +allows their research to be rigorously scrutinised, validated, or refuted by +peers, thus enhancing the quality and credibility of the scientific knowledge +produced. It is essential for maintaining public trust in scientific +research. When results are not reproducible, it undermines the reliability of +scientific discourse and can lead to scepticism towards scientific claims. This +is particularly critical when scientific research informs policy decisions in +crucial areas such as public health and environmental conservation, where +non-reproducible research could lead to misguided policies with far-reaching +consequences. As we've seen throughout this document, the open sharing of data and +methods is a cornerstone of reproducible research. It democratises access to +scientific knowledge, enabling a diverse range of researchers to participate in +and contribute to scientific discovery, regardless of their geographical, +political or institutional affiliations. Reproducibility acts as a bulwark +against fraud and bias. It ensures that research content, findings, and discoveries +are genuine and not the result of manipulated data, thus promoting fair +distribution of resources and recognition within the scientific community. + +=== Philosophical Considerations + +According to @kpopper1934, a cornerstone of scientific inquiry is that a theory +must be falsifiable; that is, it can be disproven through empirical evidence. +While evidence alone cannot conclusively verify a hypothesis, it can refute one. 
+Reproducibility is essential in this context as it allows hypotheses to be +rigorously tested and either validated or refuted, thus contributing to the +evolution of scientific truth. + +Scientific knowledge is not static but accumulates iteratively. Reproducibility +fortifies this process by ensuring that each new discovery builds upon a +foundation of previously verified results. This methodological consistency is +crucial for the progressive nature of scientific understanding. + +Reproducibility also underpins the pursuit of objective knowledge. It helps +distinguish robust scientific results from those that are anomalies or artefacts +of experimental error, refining our collective understanding of natural +phenomena. The establishment of scientific consensus relies heavily on +reproducible results. This reproducibility facilitates agreement among +scientists on what constitutes established facts, thus propelling scientific +progress and fostering collaboration across various disciplines. + +Moreover, reproducibility enhances the scientific enterprise by encouraging the +open sharing of data and methods. This openness not only fosters collaboration +but also transforms research into a collective endeavour rather than a series of +isolated efforts. It cultivates a scientific culture where data transparency and +methodological openness are normative, promoting an inclusive environment that +respects and builds upon the work of fellow researchers. By facilitating the +verification of results, reproducibility pays homage to the foundational work of +previous researchers and ensures that their contributions to knowledge are +respected and built upon. It reinforces the integrity of scientific practice and +propels the pursuit of further inquiry. + +=== Economic Considerations + +Reproducibility intersects with economic efficiency. 
Efficient reproducibility +can accelerate scientific progress by enabling quicker validation of results and +facilitating broader dissemination of knowledge. Economies of scale can be +applied where repetitive reproductions are feasible, thus reducing the unit cost +of research and making large-scale studies more financially sustainable. + +The economic impact of reproducibility also extends to its utility in +policymaking and industrial applications. Reproducible research ensures that +policies and commercial ventures based on scientific findings are underpinned by +robust and reliable evidence, thus minimising risks and maximising efficacy. +This not only bolsters public and investor confidence but also enhances the +economic utility of scientific research. + +To some extent, reproducibility is closely linked to the economy of scarcity, +where the rarity of an object or finding directly impacts its reproducibility +and associated costs. Rare phenomena or data require more specialised resources +for reproduction, which are often costly and less accessible. This scarcity +increases the economic investment required to replicate a study, from securing +rare materials to accessing specialised equipment. + +Conversely, phenomena that are not rare can be reproduced with greater ease and +at a lower cost. The abundance of necessary resources and established +methodologies makes such reproduction economically viable and less +resource-intensive. This disparity highlights a fundamental economic principle +within scientific research: the cost and feasibility of reproducibility often +depend on the availability and accessibility of resources. + +To illustrate this, consider the manual replication of a painting, where the +scarcity of the original artist’s brushstrokes and techniques makes it +challenging to reproduce the artwork with the same precision and quality. 
It +would take a significant investment of time, effort, and resources to manually +replicate the painting accurately. Choosing the proper materials, mastering the +techniques, and recreating the environment and artist’s vision are all essential +factors that contribute to the cost and feasibility of reproducing the painting. +In contrast, a mass-produced item, such as a digital photograph, can be +replicated with relative ease and at a lower cost. + +In summary, the economic implications of reproducibility encompass a range of +considerations from the broader economic impacts on efficiency, credibility, and +practical application to the costs of rare resources. + +=== Educational Considerations + +Educating students in best practices is crucial for fostering a culture of +reproducibility. We can draw a compelling parallel with mathematics, where +reproducibility is inherently embedded. Just as mathematical proofs and +solutions can be independently verified by anyone following the same steps and +logic, reproducibility in #gls("SE") aims for the same level of +transparency and verifiability. This bridge underscores that reproducibility is +a desirable trait and a foundational principle that should be rigorously applied +in computational research. For example, a professor might provide students with +the necessary data, software, and materials for a specific course through fully +reproducible methods and tools. This could involve using open-source software, +version control systems, and detailed documentation to ensure that students can +reproduce the outcome in any space and any time. + +Incorporating ethics education into research training programmes helps instil +the importance of reproducibility and integrity in scientific research, +emphasising the ethical responsibility researchers have towards producing +verifiable and reliable results. 
Organising workshops and seminars focused on +reproducibility can help disseminate best practices and foster a community +dedicated to high standards in research. These events serve as platforms for +discussion, collaboration, and the sharing of new tools and techniques. + +Furthermore, experienced researchers mentoring early-career scientists can pass +on valuable knowledge and emphasise the importance of reproducibility in their +work. Mentorship provides hands-on guidance and support, helping to build a +strong foundation for the next generation of scientists. By addressing these +educational and training considerations, we can cultivate a research environment +that values and prioritises reproducibility. + +Reproducibility is closely linked to fact-checking, as both processes involve +verifying the accuracy and reliability of research findings. Emphasising +reproducibility can significantly enhance the quality of fact-checking by +providing clear, transparent methodologies and robust data that others can +independently verify. This rigorous approach ensures the credibility of +scientific research and fosters critical thinking skills. By engaging in +reproducible research practices, individuals develop keen critical thinking, +which is essential for evaluating information, identifying biases, and making +informed decisions. Teaching the principles of reproducibility and fact-checking +from an early age is crucial. Incorporating these concepts into school curricula +helps students develop critical thinking skills early on, empowering them to +question assumptions, evaluate evidence, understand the scientific process and +verify claims for themselves. Educating students about the importance of transparency, +data integrity, and methodological rigour lays the foundation for a more +scientifically literate and critically minded society. 
By fostering these skills +from the beginning of their education, we can equip future generations with the +tools they need to navigate the complex and information-rich world. + +#leftblank(weak: false) diff --git a/src/thesis/abstract.typ b/src/thesis/abstract.typ new file mode 100644 index 0000000..ad1189a --- /dev/null +++ b/src/thesis/abstract.typ @@ -0,0 +1,29 @@ +#import "theme/abstract.typ": * + +#abstract[ + The concept of reproducibility has long been a cornerstone in scientific + research, ensuring that results are robust, repeatable, and can be + independently verified. This concept has been extended to computer science, + focusing on the ability to recreate identical software artefacts. However, the + importance of reproducibility in software engineering is often overlooked, + leading to challenges in the validation, security, and reliability of software + products. + + This master's thesis aims to investigate the current state of reproducibility + in software engineering, exploring both the barriers and potential solutions + to making software more reproducible and raising awareness. It identifies key + factors that impede reproducibility such as inconsistent environments, lack of + standardisation, and incomplete documentation. To tackle these issues, I + propose an empirical comparison of tools facilitating software + reproducibility. + + To provide a comprehensive assessment of reproducibility in software + engineering, this study adopts a methodology that involves a hands-on + evaluation of four different methods and tools. Through a systematic + evaluation of these tools, this research seeks to determine their + effectiveness in establishing and maintaining identical software environments + and builds. + + This study contributes to academic knowledge and offers practical insights + that could influence future software development protocols and standards. 
+] diff --git a/src/thesis/accessibility.typ b/src/thesis/accessibility.typ new file mode 100644 index 0000000..13d1d8f --- /dev/null +++ b/src/thesis/accessibility.typ @@ -0,0 +1,57 @@ +#import "theme/glossary.typ": * + +#pagebreak(weak: true) + +#heading("Accessibility", level: 1, outlined: false) + +This master's thesis has been written with a focus on accessibility to ensure it +can be easily read and understood by a diverse audience, including individuals +with disabilities. The following measures have been taken to enhance +accessibility: + +- Links: all hyperlinks within this document are underlined and clearly + distinguishable from regular text. This visual cue helps users identify + clickable links easily. + +- Symbols and notation: specific symbols and notation have been used + consistently throughout the document to aid comprehension. Mathematical + symbols, special characters, and other notation are presented in a clear and + readable manner. + +- Text formatting: the document uses high-contrast text formatting and font + sizes that are readable across different devices and screen resolutions. The + `New Computer Modern` #cite(,form:"normal") font is used, + chosen for its clarity and readability, especially in mathematical and + technical contexts. + +- Margins: the margins have been alternately adapted to ensure that when the + document is printed, it is suitable for binding and easy to read. This + consideration enhances the physical accessibility of the printed document. + +- Headings and structure: the document is structured with clear headings and + subheadings to facilitate navigation. This hierarchical organisation assists + readers in quickly finding relevant sections. + +- Language and terminology: plain language and concise terminology have been + employed to ensure that the content is comprehensible to a broad audience, + including those for whom English is not their first language. 
+ +- Glossary: a #link()[glossary] is included, containing the + most common abbreviations used throughout the document. This aids readers in + quickly understanding the abbreviations and acronyms, improving overall + comprehension. Additionally, comprehensive lists of + #link()[definitions], #link()[figures], + and #link()[tables] are provided. + +- Accessible file formats: the document is available in multiple file formats + to accommodate various reading preferences and assistive technologies. + +- Bibliography links: in line with the #gls("IEEE") citation style, numbers in + brackets are used to link references in the + #link()[bibliography]. This method provides a clear and + consistent way to reference sources, enhancing the readability and + accessibility of the document. + +- Images: all images in this document are in #gls("SVG") format. This vector + format ensures that images are scalable without loss of quality, providing + clear and accessible visuals on different devices and screen resolutions. diff --git a/src/thesis/acknowledgement.typ b/src/thesis/acknowledgement.typ new file mode 100644 index 0000000..71b763b --- /dev/null +++ b/src/thesis/acknowledgement.typ @@ -0,0 +1,40 @@ +#import "theme/acknowledgement.typ": * + +#acknowledgement[ + First and foremost, I would like to express my deepest gratitude to Professor + Tom Mens. I am incredibly thankful for his availability and guidance + throughout my studies. It was a great honour to receive his proposal to + supervise my research in my final year. Beyond that, he generously + allowed me to suggest topics that piqued my interest, ultimately enabling me + to focus on a subject that I am truly passionate about. + + I must also express my heartfelt appreciation for my girlfriend, Sandra. Her + endless patience and emotional support have been a constant source of + strength, motivation and inspiration for me. 
However, it is with a touch of + melancholy that I acknowledge the sacrifices we've made in our personal lives + in order to pursue this overdue academic endeavour that I should have + completed twenty years ago. + + I would like to thank my family and #emph[in-real-life] friends for their + continuous support and encouragement throughout these last years. Your belief + in me has provided the foundation upon which this work stands. I am also + deeply grateful to my #emph[online] friends, especially within the Typst and + Nix communities, for their tremendous support and constant source of + solutions, inspiration and motivation. + + A special mention is deserved for Izumi, my cat, who was a constant companion + and source of comfort over the last decade. His loss was a profound sorrow, + and I deeply miss his presence. The memories of the countless hours he spent + by my side, offering silent support during my work, have left an indelible + mark. + + Finally, I would like to express my sincere thanks to all the participants in + this research. I am particularly grateful to my colleagues at the European + Commission, who courageously and continuously supported me while remaining + unaware of my academic activities. Your valuable feedback has greatly + contributed to the development of some parts of this master's thesis. In fact, + your lack of awareness helped me understand the barriers to implementing + software reproducibility from the very beginning and in a real professional + context. Each piece of feedback has been instrumental in helping me better + understand and improve communication about this concept. 
+] diff --git a/src/thesis/disclaimer.typ b/src/thesis/disclaimer.typ new file mode 100644 index 0000000..cd6653c --- /dev/null +++ b/src/thesis/disclaimer.typ @@ -0,0 +1,8 @@ +#import "theme/disclaimer.typ": * + +#disclaimer( + title: title, + degree: degree, + author: author, + submissionDate: submissionDate, +) diff --git a/src/thesis/extra.typ b/src/thesis/extra.typ new file mode 100644 index 0000000..1749ffd --- /dev/null +++ b/src/thesis/extra.typ @@ -0,0 +1,24 @@ +#import "theme/glossary.typ": * + +#heading("Open Source", level: 1, outlined: false) + +This master's thesis has been developed exclusively using open-source tools. +Similar to an open-source project, it was maintained on GitHub but in a private +repository #cite(, form:"normal"). That repository will be made +public after the oral defence and necessary internal cleanup. Automated and +reproducible builds were managed via GitHub Actions +#cite(,form:"normal"), ensuring that a new compiled version was +published at each commit to the project. Additionally, I'm planning to publish +it on @ArXiV too after the necessary formalities are completed. + +This work is licensed under a dual licence: the #gls("CC BY 4.0") and the +#gls("HL3") licences. You are free to share and adapt the material under the +terms of the `CC BY 4.0`, provided you give appropriate credit to the original +author. You must also use the material in accordance with the ethical guidelines +specified in `HL3`, ensuring it is not used to contribute to human rights abuses +or other unethical practices. In case of any conflict between the licences, +`HL3` will take precedence. + +For the purposes of @chapter3, an open-source project +#cite(,form:"normal") was created to provide full +transparency on the results shown in that chapter. 
diff --git a/src/thesis/glossary.typ b/src/thesis/glossary.typ new file mode 100644 index 0000000..a4908ef --- /dev/null +++ b/src/thesis/glossary.typ @@ -0,0 +1,191 @@ +#import "theme/glossary.typ": * + +#glossary(terms: ( + ( + key: "CC BY 4.0", + short: "CC BY 4.0", + long: "Creative Commons Attribution 4.0 International", + desc: [The Creative Commons Attribution 4.0 International License #cite(,form:"normal") is a widely used license that allows others to distribute, remix, adapt, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most flexible of the CC licenses.], + ), + ( + key: "CDN", + short: "CDN", + long: "Content Delivery Network", + desc: [A content delivery network is a system of distributed servers that deliver web content to a user based on the geographic locations of the user, the origin of the webpage and a content delivery server, making the delivery of content more efficient.], + ), + ( + key: "CICD", + short: "CI/CD", + long: "Continuous Integration/Continuous Deployment", + desc: [Continuous Integration (CI) is a software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. Continuous Deployment (CD) is a software release process that uses automated testing to validate that changes are safe to deploy to production.], + ), + ( + key: "CPU", + short: "CPU", + long: "Central Processing Unit", + desc: [The CPU is the primary component of a computer that processes instructions. It runs the operating system and applications, constantly receiving input from the user or active software programs. It processes the data and produces outputs. 
ARM and X86 are two common CPU architectures.], + ), + ( + key: "CRA", + short: "CRA", + long: "Cyber Resilience Act", + desc: [The Cyber Resilience Act #cite(,form:"normal") is a proposed European Union regulation that aims to improve the cybersecurity of digital products and services. It includes provisions for #link()[software supply chain] security, incident reporting, and security certification.], + ), + ( + key: "CS", + short: "CS", + long: "Computer Science", + desc: [The discipline of Computer Science includes the study of algorithms and data structures, computer and network design, modelling data and information processes, and artificial intelligence. Computer Science draws some of its foundations from mathematics and engineering and therefore incorporates techniques from areas such as queueing theory, probability and statistics, and electronic circuit design.], + ), + ( + key: "CycloneDX", + short: "CycloneDX", + desc: [@cyclonedx is an open-format standard backed by the OWASP Foundation and Ecma Technical Committee designed to provide comprehensive and interoperable information about the components used within software projects like software bill of materials and advanced supply chain capabilities for cyber risk reduction.], + ), + ( + key: "DevOps", + short: "DevOps", + desc: [DevOps is a set of practices that combines software development (Dev) and IT operations (Ops). It aims to shorten the systems development life cycle and provide #gls("CICD").], + ), + ( + key: "DevSecOps", + short: "DevSecOps", + desc: [ + DevSecOps is an extension of #gls("DevOps") practices that integrates security (Sec) measures at every stage of the software development lifecycle, ensuring that security is a fundamental aspect of development and operations processes. + ], + ), + ( + key: "DSL", + short: "DSL", + long: "Domain Specific Language", + desc: [A domain-specific language is a computer language specialised to a particular application domain. 
This is in contrast to a general-purpose language, which is broadly applicable across various domains.], + ), + ( + key: "FHS", + short: "FHS", + long: "Filesystem Hierarchy Standard", + desc: [The Filesystem Hierarchy Standard is a reference document that describes the conventions used for the layout of Unix-like operating systems. This includes names, locations, and permissions of many files and directories.], + ), + ( + key: "HL3", + short: "HL3", + long: "Hippocratic Licence 3.0", + desc: [The Hippocratic Licence 3.0 #cite(,form:"normal") is a software license that ensures that software is not used to contribute to human rights abuses or other unethical practices. It is designed to protect users and communities from the potential misuse of software.], + ), + ( + key: "IEEE", + short: "IEEE", + long: "Institute of Electrical and Electronics Engineers", + desc: [The Institute of Electrical and Electronics Engineers #cite(, form:"normal"), established in 1963, is the world's largest technical professional organisation dedicated to advancing technology for the benefit of humanity. It serves as a professional association for electronic engineering, electrical engineering, and related disciplines.], + ), + ( + key: "MD5", + short: "MD5", + long: "Message Digest 5", + desc: [The MD5 message-digest algorithm is a widely used hash function producing a 128-bit hash value. MD5 was designed by Ronald Rivest in 1991 to replace an earlier hash function MD4, and was specified in 1992 as RFC 1321.], + ), + ( + key: "OCI", + short: "OCI", + long: "Open Container Initiative", + desc: [OCI stands for @opencontainerinitiative, an open governance project for the purpose of creating open industry standards around container formats and runtime. 
An "OCI image" is a container image that conforms to the OCI image format specification.], + ), + ( + key: "OS", + short: "OS", + long: "Operating System", + plural: "OSes", + longplural: "Operating Systems", + desc: [An operating system is system software that manages computer hardware and software resources, and provides common services for computer programs.], + ), + ( + key: "PDF", + short: "PDF", + long: "Portable Document Format", + desc: [A file format developed by Adobe in the 1990s to present documents, including text formatting and images, in a manner independent of application software, hardware, and operating systems.], + ), + ( + key: "POSIX", + short: "POSIX", + long: "Portable Operating System Interface", + desc: [POSIX is a family of standards specified by the #gls("IEEE") for maintaining compatibility between operating systems.], + ), + ( + key: "PURL", + short: "PURL", + plural: "PURLs", + long: "Package URL", + desc: [A PURL #cite(,form:"normal") is a #gls("URL") string used to identify and locate a software package in a mostly universal and uniform way across programming languages, package managers, packaging conventions, tools, APIs and databases.], + ), + ( + key: "REPL", + short: "REPL", + long: "Read-Eval-Print Loop", + desc: [A read-eval-print loop is an interactive computer programming environment that takes single user inputs, evaluates them, and returns the result to the user.], + ), + ( + key: "SBOM", + short: "SBOM", + plural: "SBOMs", + long: "Software Bill of Materials", + desc: [The software bill of materials is a comprehensive inventory of all components, including libraries, dependencies and versions, that constitute a software product, used for tracking and managing software supply chain security.], + ), + ( + key: "SPDX", + short: "SPDX", + long: "Software Package Data Exchange", + desc: [The @spdx format, created and maintained by the + Linux Foundation, is a standardised way of documenting and communicating the + components, 
licenses, and copyrights of software packages. It provides a + consistent method for tracking and sharing information about software contents, + particularly in open-source and collaborative environments.], + ), + ( + key: "SHA1", + short: "SHA-1", + long: "Secure Hash Algorithm 1", + desc: [SHA-1 is a hash function which takes an input and produces a 160-bit (20-byte) hash value known as a message digest – typically rendered as 40 hexadecimal digits. It was designed by the United States National Security Agency (NSA), and is a U.S. Federal Information Processing Standard.], + ), + ( + key: "SHA2", + short: "SHA-2", + long: "Secure Hash Algorithm 2", + desc: [SHA-2 is a set of cryptographic hash functions designed by the United States National Security Agency (NSA). It consists of six hash functions with digests (hash values) that are 224, 256, 384 or 512 bits: SHA-224, SHA-256, SHA-384, SHA-512, SHA-512/224, SHA-512/256.], + ), + ( + key: "SE", + short: "SE", + long: "Software Engineering", + desc: [Software Engineering is a computing discipline. 
It is the systematic application of engineering approaches to the development of software.], + ), + ( + key: "SemVer", + short: "SemVer", + long: "Semantic Versioning", + desc: [Semantic Versioning #cite(, form: "normal") is a versioning scheme for software that uses a three-part version number: `MAJOR.MINOR.PATCH`.], + ), + ( + key: "SRI", + short: "SRI", + long: "Subresource Integrity", + desc: [Subresource Integrity #cite(, form: "normal") is a security feature that allows web developers to ensure that resources they fetch are delivered without unexpected manipulation.], + ), + ( + key: "SVG", + short: "SVG", + long: "Scalable Vector Graphics", + desc: [SVG is an XML-based vector image format.], + ), + ( + key: "SWHID", + short: "SWHID", + long: "Software Heritage Identifier", + desc: [The Software Heritage Identifier #cite(,form:"normal") is a unique identifier for software artifacts, such as source code, that is used to track and reference software across different platforms and systems.], + ), + ( + key: "URL", + short: "URL", + long: "Uniform Resource Locator", + desc: [A URL is a reference to a web resource that specifies its location on a computer network and a mechanism for retrieving it.], + ), +)) diff --git a/src/thesis/imports/colors.typ b/src/thesis/imports/colors.typ new file mode 100644 index 0000000..83e3fa2 --- /dev/null +++ b/src/thesis/imports/colors.typ @@ -0,0 +1 @@ +#let color-a = rgb("#990027") // #5D071D diff --git a/src/thesis/imports/preamble.typ b/src/thesis/imports/preamble.typ new file mode 100644 index 0000000..f7f6b8e --- /dev/null +++ b/src/thesis/imports/preamble.typ @@ -0,0 +1,8 @@ +#import "@preview/diagraph:0.2.5": * +#import "@preview/codelst:2.0.1": sourcecode, sourcefile +#import "@preview/glossarium:0.4.1": make-glossary, print-glossary, gls, glspl +#import "@preview/xarrow:0.3.1": xarrow, xarrowSquiggly, xarrowTwoHead +#import "@preview/hydra:0.4.0": * +#import "@preview/cetz:0.2.2" +#import "colors.typ": * +#import 
"workarounds.typ": * diff --git a/src/thesis/imports/workarounds.typ b/src/thesis/imports/workarounds.typ new file mode 100644 index 0000000..5a0efb6 --- /dev/null +++ b/src/thesis/imports/workarounds.typ @@ -0,0 +1,57 @@ +#import "@preview/codelst:2.0.1": sourcecode, sourcefile + +/* shell(body): typesets a shell transcript through codelst's sourcecode. `body` is handed to raw(), so it is presumably a plain string - TODO confirm against callers. A line whose first character is a key of `kinds` and whose second character is a space is treated as a prompt line. */#let shell(body) = { + let body = raw(body) + /* Prompt marker -> colour used when the marker is re-emitted in front of its line. */ let kinds = ( + "$": green.darken(30%), + "#": blue.darken(10%), + ">": luma(40%), + " ": luma(100%), + ) + /* One (marker, rest) pair per input line; marker is none when the line does not start with a known marker followed by a space. */ let lines = body.text.split("\n").map(line => { + if line.at(0, default: "") in kinds and line.at(1, default: "") == " " { + (line.at(0), line.slice(2)) + } else { + (none, line) + } + }) + + set par(justify: false) + + /* Per-line show rule: it.number - 1 indexes `lines`; prompt lines get the coloured marker, a space and it.body, every other line is printed from it.text in luma(50%). */ show raw.line: it => [ + #let (kind, line) = lines.at(it.number - 1) + #if kind != none { + text(fill: kinds.at(kind), kind) + " " + it.body + } else { + text(fill: luma(50%), it.text) + } + ] + + show raw.line: set text(font: "Inconsolata Nerd Font Mono") + + /* The raw block given to sourcecode holds only the stripped line text (lang "sh"); the markers come back through the raw.line show rule. numbers-style renders each line number as half-size text in luma(160). */ sourcecode(numbers-style: line-no => text( + fill: luma(160), + size: .5em, + str(line-no), + ))[ + #raw(lang: "sh", lines.map(((_, line)) => line).join("\n")) + ] +} + +/* LaTeX: logo assembled from the letters L, A, T, E, X. Every letter after the L sits in box(move(...)) with hand-tuned dx/dy offsets (A is additionally scaled to 65%); the trailing h(-8.0pt) adds negative horizontal space after the X. */#let LaTeX = { + [L];box(move( + dx: -4.2pt, dy: -1.2pt, + box(scale(65%)[A]) + ));box(move( + dx: -5.7pt, dy: 0pt, + [T] +));box(move( + dx: -7.0pt, dy: 2.7pt, + box(scale(100%)[E]) +));box(move( + dx: -8.0pt, dy: 0pt, + [X] +));h(-8.0pt) +} + +/* Text show rule: replaces the literal text "LaTeX" with the logo value bound to LaTeX. NOTE(review): verify that this rule has any effect in files that only import this module. */#show "LaTeX": LaTeX diff --git a/src/thesis/literature.bib b/src/thesis/literature.bib new file mode 100644 index 0000000..3de01bf --- /dev/null +++ b/src/thesis/literature.bib @@ -0,0 +1,951 @@ +@book{Bacon1928, + title = {Opus Majus, Volumes 1 and 2}, + author = {Roger Bacon}, + year = 1928, + publisher = {University of Pennsylvania Press}, + address = {Philadelphia}, + doi = {10.9783/9781512814064}, + isbn = 9781512814064, + origdate = 1267 +} +@book{kpopper1934, + title = {The Logic of Scientific Discovery}, + author = {Popper, K.
R.}, + year = 1934, + publisher = {Hutchinson}, + address = {London}, + added-at = {2008-03-11T14:52:34.000+0100}, + biburl = {https://www.bibsonomy.org/bibtex/28fc60d55651f1750c37c770e5286c132/idsia}, + interhash = {5c732a74fb59e943849b250588538875}, + intrahash = {8fc60d55651f1750c37c770e5286c132}, + keywords = {juergen}, + priority = 2, + timestamp = {2008-03-11T14:55:44.000+0100} +} +@article{Thompson84, + title = {Reflections on Trusting Trust}, + author = {Ken Thompson}, + year = 1984, + journal = {Commun. {ACM}}, + volume = 27, + number = 8, + pages = {761--763}, + doi = {10.1145/358198.358210}, + timestamp = {Wed, 14 Nov 2018 10:22:35 +0100}, + biburl = {https://dblp.org/rec/journals/cacm/Thompson84.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +@inproceedings{MarcelFourne2023, + title = {It’s like flossing your teeth: On the Importance and Challenges of Reproducible Builds for Software Supply Chain Security}, + author = {Fourné, Marcel and Wermke, Dominik and Enck, William and Fahl, Sascha and Acar, Yasemin}, + year = 2023, + booktitle = {2023 IEEE Symposium on Security and Privacy (SP)}, + pages = {1527--1544}, + doi = {10.1109/SP46215.2023.10179320} +} +@inbook{Claerbout1992, + title = {Electronic documents give reproducible research a new meaning}, + author = {Jon F. Claerbout and Martin Karrenbach}, + year = 1992, + booktitle = {SEG Technical Program Expanded Abstracts 1992}, + pages = {601--604}, + doi = {10.1190/1.1822162}, + origdate = 1992 +} +@article{Collberg2012, + title = {Repeatability in Computer Systems Research}, + author = {Collberg, Christian and Proebsting, Todd A.}, + year = 2016, + month = feb, + journal = {Commun. 
ACM}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + volume = 59, + number = 3, + pages = {62–69}, + doi = {10.1145/2812803}, + issn = {0001-0782}, + issue_date = {March 2016}, + abstract = {To encourage repeatable research, fund repeatability engineering and reward commitments to sharing research artifacts.}, + numpages = 8 +} +@article{ESSAWY2020104753, + title = {A taxonomy for reproducible and replicable research in environmental modelling}, + author = {Bakinam T. Essawy and Jonathan L. Goodall and Daniel Voce and Mohamed M. Morsy and Jeffrey M. Sadler and Young Don Choi and David G. Tarboton and Tanu Malik}, + year = 2020, + journal = {Environmental Modelling & Software}, + volume = 134, + pages = 104753, + doi = {10.1016/j.envsoft.2020.104753}, + issn = {1364-8152}, + keywords = {Reproducibility, Replicability, Containers, Docker, Singularity}, + abstract = {Despite the growing acknowledgment of reproducibility crisis in computational science, there is still a lack of clarity around what exactly constitutes a reproducible or replicable study in many computational fields, including environmental modelling. To this end, we put forth a taxonomy that defines an environmental modelling study as being either 1) repeatable, 2) runnable, 3) reproducible, or 4) replicable. We introduce these terms with illustrative examples from hydrology using a hydrologic modelling framework along with cyberinfrastructure aimed at fostering reproducibility. Using this taxonomy as a guide, we argue that containerization is an important but lacking component needed to achieve the goal of computational reproducibility in hydrology and environmental modelling. 
Examples from hydrology are provided to demonstrate how new tools, including a user-friendly tool for containerization of computational analyses called Sciunit, can lower the barrier to reproducibility and replicability in the environmental modelling community.} +} +@book{NAP25303, + title = {Reproducibility and Replicability in Science}, + author = {National Academies of Sciences, Engineering, and Medicine}, + year = 2019, + publisher = {The National Academies Press}, + address = {Washington, DC}, + doi = {10.17226/25303}, + isbn = {978-0-309-48616-3}, + abstract = {One of the pathways by which the scientific community confirms the validity of a new scientific discovery is by repeating the research that produced it. When a scientific effort fails to independently confirm the computations or results of a previous study, some fear that it may be a symptom of a lack of rigor in science, while others argue that such an observed inconsistency can be an important precursor to new discovery.\nConcerns about reproducibility and replicability have been expressed in both scientific and popular media. As these concerns came to light, Congress requested that the National Academies of Sciences, Engineering, and Medicine conduct a study to assess the extent of issues related to reproducibility and replicability and to offer recommendations for improving rigor and transparency in scientific research.\nReproducibility and Replicability in Science defines reproducibility and replicability and examines the factors that may lead to non-reproducibility and non-replicability in research. Unlike the typical expectation of reproducibility between two computations, expectations about replicability are more nuanced, and in some cases a lack of replicability can aid the process of scientific discovery. 
This report provides recommendations to researchers, academic institutions, journals, and funders on steps they can take to improve reproducibility and replicability in science.} +} +@article{cacioppo2015social, + title = {Social, behavioral, and economic sciences perspectives on robust and reliable science}, + author = {Cacioppo, John T and Kaplan, Robert M and Krosnick, Jon A and Olds, James L and Dean, Heather}, + year = 2015 +} +@article{Schwab2000, + title = {Making scientific computations reproducible}, + author = {Schwab, M. and Karrenbach, N. and Claerbout, J.}, + year = 2000, + journal = {Computing in Science & Engineering}, + volume = 2, + number = 6, + pages = {61--67}, + doi = {10.1109/5992.881708} +} +@article{Barba2018, + title = {Terminologies for Reproducible Research}, + author = {Lorena A. Barba}, + year = 2018, + month = feb, + journal = {arXiv}, + doi = {10.48550/arXiv.1802.03311}, + eprint = {1802.03311}, + archiveprefix = {arXiv} +} +@article{Donoho2009, + title = {Reproducible Research in Computational Harmonic Analysis}, + author = {Donoho, David L. and Maleki, Arian and Rahman, Inam Ur and Shahram, Morteza and Stodden, Victoria}, + year = 2009, + journal = {Computing in Science & Engineering}, + volume = 11, + number = 1, + pages = {8--18}, + doi = {10.1109/MCSE.2009.15} +} +@article{Goodman2016, + title = {What does research reproducibility mean?}, + author = {Steven N. Goodman and Daniele Fanelli and John P. A. Ioannidis}, + year = 2016, + journal = {Science Translational Medicine}, + volume = 8, + number = 341, + pages = {341ps12--341ps12}, + doi = {10.1126/scitranslmed.aaf5027}, + abstract = {The language and conceptual framework of “research reproducibility” are nonstandard and unsettled across the sciences.
In this Perspective, we review an array of explicit and implicit definitions of reproducibility and related terminology, and discuss how to avoid potential misunderstandings when these terms are used as a surrogate for “truth.”} +} +@article{Peng2009, + title = {{Reproducible research and Biostatistics}}, + author = {Peng, Roger D.}, + year = 2009, + month = {07}, + journal = {Biostatistics}, + volume = 10, + number = 3, + pages = {405--408}, + doi = {10.1093/biostatistics/kxp014}, + issn = {1465-4644} +} +@article{Acm2018, + title = {Reproducibility in Scientific Computing}, + author = {Ivie, Peter and Thain, Douglas}, + year = 2018, + month = jul, + journal = {ACM Comput. Surv.}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + volume = 51, + number = 3, + doi = {10.1145/3186266}, + issn = {0360-0300}, + issue_date = {May 2019}, + abstract = {Reproducibility is widely considered to be an essential requirement of the scientific process. However, a number of serious concerns have been raised recently, questioning whether today’s computational work is adequately reproducible. In principle, it should be possible to specify a computation to sufficient detail that anyone should be able to reproduce it exactly. But in practice, there are fundamental, technical, and social barriers to doing so. The many objectives and meanings of reproducibility are discussed within the context of scientific computing. Technical barriers to reproducibility are described, extant approaches surveyed, and open areas of research are identified.}, + articleno = 63, + numpages = 36, + keywords = {computational science, replicability, reproducible, workflows, scientific computing, scientific workflow, scientific workflows, workflow, Reproducibility} +} +@article{Castillo1669, + title = {The Scientific Method: A Need for Something Better?}, + author = {M. 
Castillo}, + year = 2013, + journal = {American Journal of Neuroradiology}, + publisher = {American Journal of Neuroradiology}, + volume = 34, + number = 9, + pages = {1669--1671}, + doi = {10.3174/ajnr.A3401}, + issn = {0195-6108} +} +@article{abs-2104-06020, + title = {Reproducible Builds: Increasing the Integrity of Software Supply Chains}, + author = {Chris Lamb and Stefano Zacchiroli}, + year = 2021, + journal = {CoRR}, + volume = {abs/2104.06020}, + doi = {10.48550/arXiv.2104.06020}, + eprinttype = {arXiv}, + eprint = {2104.06020}, + timestamp = {Mon, 19 Apr 2021 16:45:47 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-06020.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +@inproceedings{malka-hal-04430009, + title = {{Reproducibility of Build Environments through Space and Time}}, + author = {Malka, Julien and Zacchiroli, Stefano and Zimmermann, Th{\'e}o}, + year = 2024, + month = apr, + booktitle = {{46th International Conference on Software Engineering (ICSE 2024) - New Ideas and Emerging Results (NIER) Track}}, + address = {Lisbonne, Portugal}, + doi = {10.48550/arXiv.2402.00424}, + keywords = {reproducibility ; development environment ; functional package management ; nix}, + pdf = {https://arxiv.org/pdf/2402.00424}, + hal_id = {hal-04430009}, + hal_version = {v1} +} +@article{TSE2019, + title = {What Do Package Dependencies Tell Us About Semantic Versioning?}, + author = {Decan, Alexandre and Mens, Tom}, + year = 2021, + journal = {IEEE Transactions on Software Engineering}, + volume = 47, + number = 6, + pages = {1226--1240}, + doi = {10.1109/TSE.2019.2918315}, + keywords = {Ecosystems;Software;Semantics;Packaging;Libraries;Java;Computer bugs;Software engineering;distribution, maintenance, and enhancement;maintainability;version control;metrics/measurement;reusable software;reusable libraries;configuration management;software release management and delivery} +} +@inproceedings{malka-hal-04482192, + title = 
{{Increasing Trust in the Open Source Supply Chain with Reproducible Builds and Functional Package Management}}, + author = {Malka, Julien}, + year = 2024, + month = apr, + booktitle = {{46th International Conference on Software Engineering (ICSE 2024) - Doctoral Symposium (DS) Track}}, + address = {Lisbonne, Portugal}, + doi = {10.1145/3639478.3639806}, + url = {https://hal.science/hal-04482192}, + keywords = {Software supply chain ; Reproducible builds ; Functional package management}, + pdf = {https://hal.science/hal-04482192/file/main.pdf}, + hal_id = {hal-04482192}, + hal_version = {v1} +} +@inproceedings{solarwinds-9579611, + title = {Solar Winds Hack: In-Depth Analysis and Countermeasures}, + author = {Alkhadra, Rahaf and Abuzaid, Joud and AlShammari, Mariam and Mohammad, Nazeeruddin}, + year = 2021, + booktitle = {2021 12th International Conference on Computing Communication and Networking Technologies (ICCCNT)}, + pages = {1--7}, + doi = {10.1109/ICCCNT51525.2021.9579611}, + keywords = {Technological innovation;Computer hacking;Shape;Supply chains;Government;Companies;Software systems;cybersecurity;hack;malware;cyberattack;supply chain;supply chain attack} +} +@article{mueller2012stuxnet, + title = {The stuxnet worm}, + author = {Mueller, Paul and Yadegari, Babak}, + year = 2012, + journal = {Département des sciences de l'informatique, Université de l'Arizona}, + url = {https://www2.cs.arizona.edu/~collberg/Teaching/466-566/2012/Resources/presentations/topic9-final/report.pdf} +} +@article{Heartbleed101, + title = {Heartbleed 101}, + author = {Carvalho, Marco and DeMott, Jared and Ford, Richard and Wheeler, David A.}, + year = 2014, + journal = {IEEE Security & Privacy}, + volume = 12, + number = 4, + pages = {63--67}, + doi = {10.1109/MSP.2014.66}, + keywords = {Electronic commerce;Software development;Payloads;Privacy;Computer crashes;Resource management;cybercrime;security;hackers;Heartbleed} +} +@misc{ReproducibleBuildsOrg, + title = {Reproducible
Builds Website}, + author = {{Reproducible Builds}}, + year = 2015, + url = {https://reproducible-builds.org/} +} +@book{LibCManual, + title = {The GNU C library reference manual}, + author = {Loosemore, Sandra and Stallman, Richard and McGrath, Roland and Oram, Andrew and Drepper, Ulrich}, + year = 2023, + url = {https://sourceware.org/glibc/manual/2.39/pdf/libc.pdf} +} +@book{tlpi, + title = {The Linux Programming Interface: A Linux and UNIX System Programming Handbook}, + author = {Kerrisk, Michael}, + year = 2010, + publisher = {No Starch Press}, + address = {USA}, + isbn = 1593272200, + edition = {1st}, +} +@misc{diffoscope, + title = {diffoscope}, + author = {{Reproducible Builds}}, + year = 2014, + url = {https://diffoscope.org} +} +@misc{strip-nondeterminism, + title = {Strip Nondeterminism}, + author = {{Strip Nondeterminism}}, + year = 2014, + url = {https://salsa.debian.org/reproducible-builds/strip-nondeterminism} +} +@article{courtes_2022_6581453, + title = {Building a Secure Software Supply Chain with GNU Guix}, + author = {Courtès, Ludovic}, + year = 2022, + month = jun, + journal = {The Art, Science, and Engineering of Programming}, + publisher = {Aspect-Oriented Software Association (AOSA)}, + volume = 7, + number = 1, + doi = {10.22152/programming-journal.org/2023/7/1}, + issn = {2473-7321} +} +@misc{acm_artifact_review_badging, + title = {Artifact Review and Badging}, + author = {{Association for Computing Machinery}}, + year = 2020, + url = {https://www.acm.org/publications/policies/artifact-review-and-badging-current}, + shortauthor = {ACM} +} +@phdthesis{Dolstra2006, + title = {The Purely Functional Software Deployment Model}, + author = {Dolstra, Eelco}, + year = 2006, + month = jan, + address = {Utrecht, The Netherlands}, + url = {https://dspace.library.uu.nl/handle/1874/7540}, + tags = {programming languages, rule-based, deployment, completeness, meta programming, software components, meta-model, modeling language, modeling, language 
modeling, functional programming, software component, source-to-source, information models, programming, Meta-Environment, process modeling, systematic-approach, open-source}, + researchr = {https://researchr.org/publication/Dolstra2006}, + school = {Utrecht University}, + advisor = {Eelco Visser and S. Doaitse Swierstra} +} +@misc{guixwebsite, + title = {Guix Website}, + author = {{Guix}}, + year = 2012, + url = {https://guix.gnu.org/} +} +@misc{courtes2013functional, + title = {Functional Package Management with Guix}, + author = {Courtès, Ludovic}, + year = 2013, + doi = {10.48550/arXiv.1305.4584}, + eprint = {1305.4584}, + archiveprefix = {arXiv}, + primaryclass = {cs.PL} +} +@misc{fsfwebsite, + title = {The Free Software Foundation}, + author = {{Free Software Foundation}}, + year = 1985, + url = {https://www.fsf.org/} +} +@misc{opencontainerinitiative, + title = {Open Container Initiative}, + author = {{Open Container Initiative}}, + year = 2015, + url = {https://opencontainers.org/} +} +@misc{podman, + title = {Podman}, + author = {{Podman}}, + year = 2018, + url = {https://podman.io/} +} +@software{kubernetes, + title = {Kubernetes}, + author = {{Kubernetes}}, + year = 2014, + url = {https://kubernetes.io/} +} +@misc{docker, + title = {Docker}, + author = {{Docker, Inc.}}, + year = 2013, + url = {https://www.docker.com/} +} +@misc{dockerhub, + title = {Docker Hub}, + author = {{Docker, Inc.}}, + year = 2013, + url = {https://hub.docker.com/} +} +@misc{nix, + title = {Nix}, + author = {{Nix}}, + year = 2003, + url = {https://nixos.org/} +} +@misc{Executive-Order-14028, + title = {Executive Order 14028: Improving the Nation's Cybersecurity}, + author = {{Joe Biden}}, + year = 2021, + url = {https://www.federalregister.gov/d/2021-10460} +} +@software{typst, + title = {{Typst}}, + author = {Mädje, Laurenz and Haug, Martin and {The Typst Project Developers}}, + url = {https://github.com/typst/typst}, + license = {Apache-2.0} +} +@misc{typstdoc, + title = {Typst 
documentation website}, + author = {{Typst Documentation}}, + year = 2023, + url = {https://typst.app/docs/} +} +@article{hinsenKonrad2020guix, + title = {Reproducible computations with Guix}, + author = {Hinsen, Konrad}, + year = 2020, + url = {https://guix.gnu.org/en/blog/2020/reproducible-computations-with-guix/} +} +@online{drupolPrOnNix, + title = {Pol Dellaiera's pull requests in the Nix project}, + author = {{Github NixOS project}}, + year = 2021, + url = {https://github.com/NixOS/nixpkgs/pulls?page=1&q=is%3Apr+is%3Aclosed+author%3Adrupol} +} +@misc{drupolReviewOnNix, + title = {Pol Dellaiera's reviews in the Nix project}, + author = {{Github NixOS project}}, + year = 2021, + url = {https://github.com/NixOS/nixpkgs/pulls?q=is%3Apr+is%3Aclosed+reviewed-by%3Adrupol} +} +@misc{drupolPhpBuilderOnNix, + title = {Nix builder for PHP applications}, + author = {{Github NixOS project}}, + year = 2021, + url = {https://github.com/NixOS/nixpkgs/pull/225401} +} +@misc{drupolComposerReproduciblePr, + title = {Composer reproducible pull request}, + author = {{Github Composer project}}, + year = 2021, + url = {https://github.com/composer/composer/pull/11663} +} +@misc{drupolIpc2023Talk, + title = {Leverage Nix in the PHP ecosystem}, + author = {{International PHP Conference}}, + year = 2023, + url = {https://phpconference.com/web-development/leveraging-nix-php-ecosystem/} +} +@misc{drupolPhpunitPR5576, + title = {Add composer.lock}, + author = {{Github PHPunit project}}, + year = 2023, + url = {https://github.com/sebastianbergmann/phpunit/pull/5576} +} +@misc{drupolPsyshIssue767, + title = {Consider adding a composer.lock file in the repository}, + author = {{Github Psysh project}}, + year = 2023, + url = {https://github.com/bobthecow/psysh/issues/767} +} +@misc{drupolGrumphpIssue1095, + title = {Consider adding a composer.lock file in the repository}, + author = {{Github GrumPHP project}}, + year = 2023, + url = {https://github.com/phpro/grumphp/issues/1095} +} 
+@misc{drupolPsalmIssue10446, + title = {Adding composer.lock in VCS ?}, + author = {{Github Psalm project}}, + year = 2023, + url = {https://github.com/vimeo/psalm/issues/10446} +} +@misc{drupolPhpmdIssue1056, + title = {Adding composer.lock in VCS ?}, + author = {{Github PHPMD project}}, + year = 2023, + url = {https://github.com/phpmd/phpmd/issues/1056} +} +@misc{drupolPhpCsFixerIssue7590, + title = {Publish composer.lock for each release}, + author = {{Github PHP-CS-Fixer project}}, + year = 2023, + url = {https://github.com/PHP-CS-Fixer/PHP-CS-Fixer/issues/7590} +} +@misc{drupolPhpParallelLintIssue153, + title = {Adding composer.lock under VCS ?}, + author = {{Github PHP Parallel Lint project}}, + year = 2023, + url = {https://github.com/php-parallel-lint/PHP-Parallel-Lint/issues/153} +} +@misc{drupolRBPR102, + title = {flake: add Nix flake files}, + author = {{Reproducible Builds project}}, + year = 2023, + url = {https://salsa.debian.org/reproducible-builds/reproducible-website/-/merge_requests/102} +} +@misc{joinupSignal, + title = {European Commission to use open source messaging service Signal}, + author = {{Joinup - Open Source Observatory}}, + year = 2020, + url = {https://joinup.ec.europa.eu/collection/open-source-observatory-osor/news/signal-messaging-service} +} +@software{signalApp, + title = {Signal, A private messenger}, + author = {{Signal Foundation}}, + year = 2013, + url = {https://signal.org} +} +@misc{openAIKeynote2023, + title = {OpenAI DevDay: Opening Keynote}, + author = {Sam Altman}, + year = 2023, + url = {https://youtu.be/U9mJuUkhUzk?t=421} +} +@misc{NixOS, + title = {NixOS}, + author = {Eelco Dolstra}, + year = 2003, + url = {https://nixos.org/} +} +@misc{typstReproducibleBuildIssue1, + title = {Enhancing reproducibility of Typst documents with SOURCE_DATE_EPOCH}, + author = {{Github Typst project}}, + year = 2024, + url = {https://github.com/typst/typst/issues/3806} +} +@book{clarke1973, + title = {Profiles of the Future; an
Inquiry into the Limits of the Possible}, + author = {Clarke, Arthur C.}, + year = 1973, + publisher = {Harper & Row} +} +@misc{typstReproducibleBuildIssue2, + title = {More control over typst compilation environment}, + author = {{Github Typst project}}, + year = 2024, + url = {https://github.com/typst/typst/issues/3892} +} +@misc{drupolRBPR113, + title = {Add bibliography.bib file to the repository and update Academic Publications page accordingly}, + author = {{Reproducible Builds project}}, + year = 2023, + url = {https://salsa.debian.org/reproducible-builds/reproducible-website/-/merge_requests/113} +} +@misc{drupolRBPR114, + title = {buy-in: add SBOM and ephemeral development environments}, + author = {{Reproducible Builds project}}, + year = 2023, + url = {https://salsa.debian.org/reproducible-builds/reproducible-website/-/merge_requests/114} +} +@misc{php, + title = {PHP}, + author = {Rasmus Lerdorf}, + year = 1994, + url = {https://php.net/} +} +@misc{composer, + title = {Composer}, + author = {Adermann, Nils and Boggiano, Jordi}, + year = 2011, + url = {https://getcomposer.org/} +} +@misc{CVE-2024-3094, + title = {{CVE}-2024-3094}, + author = {{National Institute of Standards and Technology}}, + year = 2024, + month = mar, + url = {https://nvd.nist.gov/vuln/detail/CVE-2024-3094}, + urldate = {7 April 2024} +} +@misc{xz, + title = {XZ Utils}, + author = {Lasse Collin}, + year = 2009, + url = {https://tukaani.org/xz/} +} +@misc{devs-profile, + title = {Devs profile}, + author = {{European Commission}}, + year = 2022, + url = {https://code.europa.eu/ecphp/devs-profile} +} +@misc{ecphpSessions, + title = {ECPHP Sessions}, + author = {{European Commission}}, + year = 2020, + url = {https://code.europa.eu/ecphp/sessions} +} +@online{teb, + title = {La Tronche En Biais}, + author = {Thomas C. 
Durand and Vled Tapas}, + url = {https://www.youtube.com/user/TroncheEnBiais}, + date = 2014, + organization = {Youtube} +} +@online{teb-r13y-crisis-lib, + title = {SCIENCES : Une crise de reproductibilité des études ?}, + author = {Thomas C. Durand and Vled Tapas}, + url = {https://www.youtube.com/watch?v=OA8Eki7fvAw}, + date = 2024, + organization = {La Tronche En Biais} +} +@article{Larigaldie2024, + title = {eyeScrollR: A software method for reproducible mapping of eye-tracking data from scrollable web pages}, + author = {Larigaldie, Nathanael and Dreneva, Anna and Orquin, Jacob L.}, + year = 2024, + month = feb, + day = 12, + journal = {Behavior Research Methods}, + doi = {10.3758/s13428-024-02343-1}, + issn = {1554-3528}, + abstract = {The Internet has become an important part of our lives and an increasing number of researchers use eye-tracking technology to examine attention and behavior in online environments. Researchers, however, face a significant challenge in mapping eye-tracking data from scrollable web pages. We describe the R package eyeScrollR for mapping eye-tracking data from scrollable content such as web pages. The package re-maps eye-tracking gaze coordinates to full-page coordinates with a deterministic algorithm based on mouse scroll data. The package includes options for handling common situations, such as sticky menus or ads that remain visible when the user scrolls. We test the package's validity in different hardware and software settings and on different web pages and show that it is highly accurate when tested against manual coding. Compared to current methods, eyeScrollR provides a more reproducible and reliable approach for mapping eye-tracking data from scrollable web pages. With its open code and free availability, we recommend eyeScrollR as an essential tool for eye-tracking researchers, particularly those who adhere to open-science principles. 
The eyeScrollR package offers a valuable contribution to the field of eye-tracking research, facilitating accurate and standardized analysis of eye-tracking data in web scrolling contexts.} +} +@article{peng2011, + title = {Reproducible Research in Computational Science}, + author = {Roger D. Peng}, + year = 2011, + journal = {Science}, + volume = 334, + number = 6060, + pages = {1226--1227}, + doi = {10.1126/science.1213847}, + abstract = {Computational science has led to exciting new developments, but the nature of the work has exposed limitations in our ability to evaluate published findings. Reproducibility has the potential to serve as a minimum standard for judging scientific claims when full independent replication of a study is not possible.} +} +@misc{gnumake, + title = {GNU Make}, + author = {{The Free Software Foundation}}, + url = {https://www.gnu.org/software/make/} +} +@misc{drupolPhpBuilderOnNixV2, + title = {Nix builder for PHP applications, version 2}, + author = {{Github NixOS project}}, + year = 2024, + url = {https://github.com/NixOS/nixpkgs/pull/308059} +} +@misc{PhpSrcReproduciblePhar, + title = {Make PHAR reproducible}, + year = 2023, + url = {https://github.com/theseer/Autoload/issues/114} +} +@misc{drupolRBMonthlyReports, + title = {{Git repository of the reproducible-builds.org website}}, + year = 2024, + url = {https://salsa.debian.org/reproducible-builds/reproducible-website/-/commits/master?author=Pol%20Dellaiera} +} +@misc{SignalReproducible, + title = {Reproducible Signal builds for Android}, + year = 2016, + url = {https://signal.org/blog/reproducible-android/} +} +@misc{TelegramReproducible, + title = {Reproducible Builds for iOS and Android}, + year = 2019, + url = {https://telegram.org/blog/verifiable-apps-and-more} +} +@misc{ghActions, + title = {GitHub Actions}, + url = {https://github.com/features/actions} +} +@misc{sri, + title = {Subresource Integrity. W3C Recommendation. 
W3C}, + author = {Akhawe, Devdatta and Braun, Frederik and Marier, Francois and Weinberger, Joel}, + year = 2016, + url = {https://www.w3.org/TR/SRI/} +} +@software{terraform, + title = {Terraform}, + author = {{HashiCorp}}, + year = 2014, + url = {https://www.terraform.io/} +} +@software{chef, + title = {Chef}, + author = {{Chef}}, + year = 2009, + url = {https://www.chef.io/} +} +@software{puppet, + title = {Puppet}, + author = {{Puppet}}, + year = 2005, + url = {https://puppet.com/} +} +@software{ansible, + title = {Ansible}, + author = {{Ansible}}, + year = 2012, + url = {https://www.ansible.com/} +} +@misc{devcontainer, + title = {Developing inside a Container}, + author = {{Visual Studio Code}}, + year = 2021, + url = {https://containers.dev/} +} +@software{vscode, + title = {Visual Studio Code}, + author = {{Microsoft}}, + year = 2015, + url = {https://code.visualstudio.com/} +} +@software{git, + title = {Git}, + author = {Linus Torvalds}, + year = 2005, + url = {https://git-scm.com/} +} +@misc{nix24release, + title = {Nix 2.4 release announcement}, + author = {Eelco Dolstra}, + year = 2021, + url = {https://discourse.nixos.org/t/nix-2-4-released/15822} +} + +@inbook{swh, + title = {The Software Heritage Open Science Ecosystem}, + isbn = {9783031360602}, + doi = {10.1007/978-3-031-36060-2_2}, + booktitle = {Software Ecosystems}, + publisher = {Springer International Publishing}, + author = {Cosmo, Roberto Di and Zacchiroli, Stefano}, + year = {2023}, + pages = {33–61} +} + +@inproceedings{10-1007-978-3-319-27308-2_47, + author = {Courtès, Ludovic and Wurmus, Ricardo}, + title = {Reproducible and User-Controlled Software Environments in HPC with Guix}, + booktitle = {Euro-Par 2015: Parallel Processing Workshops}, + year = {2015}, + publisher = {Springer International Publishing}, + address = {Cham}, + pages = {579--591}, + abstract = {Support teams of high-performance computing (HPC) systems often find themselves between a rock and a hard place: on one hand, 
they understandably administrate these large systems in a conservative way, but on the other hand, they try to satisfy their users by deploying up-to-date tool chains as well as libraries and scientific software. HPC system users often have no guarantee that they will be able to reproduce results at a later point in time, even on the same system---software may have been upgraded, removed, or recompiled under their feet, and they have little hope of being able to reproduce the same software environment elsewhere. We present GNU Guix and the functional package management paradigm and show how it can improve reproducibility and sharing among researchers with representative use cases.}, + isbn = {978-3-319-27308-2}, + doi = {10.1007/978-3-319-27308-2_47} +} + +@misc{octoverse2022, + title = {The State of the Octoverse 2022}, + author = {{GitHub}}, + year = 2022, + url = {https://octoverse.github.com/2022/state-of-open-source} +} + +@misc{purl, + title = {PURL Specification}, + author = {Philippe Ombredanne}, + year = 2017, + month = Nov, + url = {https://github.com/package-url/purl-spec} +} + +@article{preston2013semantic, + title = {Semantic Versioning 2.0.0}, + author = {Preston-Werner, Tom},
+ url = {https://semver.org/}, + year = {2013} +} + +@book{CRA, + author = {{European Commission} and {DG CNECT}}, + title = {Cyber Resilience Act}, + year = {2022}, + url = {https://ec.europa.eu/newsroom/dae/redirection/document/89543} +} + +@inproceedings{hal-01865790, + title = {{Identifiers for Digital Objects: the Case of Software Source Code Preservation}}, + author = {Di Cosmo, Roberto and Gruenpeter, Morane and Zacchiroli, Stefano}, + url = {https://hal.science/hal-01865790}, + booktitle = {{iPRES 2018 - 15th International Conference on Digital Preservation}}, + address = {Boston, United States}, + pages = {1-9}, + year = {2018}, + month = Sep, + doi = {10.17605/OSF.IO/KDE56}, + pdf = {https://hal.science/hal-01865790v4/file/main.pdf}, + hal_id = {hal-01865790}, + hal_version = {v4}, +} + +@misc{nixpkgs-pull-256270, + title = {stdenv: option to determine SOURCE_DATE_EPOCH in fetchers}, + author = {{Github NixOS project}}, + year = 2023, + month = Sep, + url = {https://github.com/NixOS/nixpkgs/pull/256270}, +} + +@software{guile, + title = {GNU Guile}, + author = {{GNU Guile}}, + year = 1993, + url = {https://www.gnu.org/software/guile/} +} + +@book{dybvig2009scheme, + title = {The SCHEME programming language}, + author = {Dybvig, R Kent}, + year = {2009}, + publisher = {Mit Press} +} + +@misc{PolMasterThesis, + title = {Reproducibility in Software Engineering}, + author = {Dellaiera, Pol}, + year = 2024, + url = {https://github.com/drupol/master-thesis} +} + +@misc{CCBy40, + title = {Creative Commons Attribution 4.0 International}, + author = {{Creative Commons}}, + year = 2013, + url = {https://creativecommons.org/licenses/by/4.0/} +} + +@misc{HypocraticLicence, + title = {The Hippocratic License 3.0}, + author = {Ada Ehmke, Coraline}, + url = {https://firstdonoharm.dev/} +} + +@misc{ITripleE, + title = {IEEE website}, + author = {{IEEE}}, + year = 1963, + url = {https://www.ieee.org/} +} + +@misc{r13yBuildScenarios, + title = {Reproducibility in Software Engineering}, + author =
{Dellaiera, Pol}, + year = 2024, + url = {https://github.com/drupol/r13y-build-scenarios/} +} + +@misc{drupolNixpkgsCommitter, + title = {Nixpkgs committer}, + year = 2023, + url = {https://github.com/orgs/NixOS/teams/nixpkgs-committers?query=drupol} +} + +@misc{NewComputerModern, + title = {The New Computer Modern Font CTAN project}, + author = {Tsolomitis, Antonis}, + year = 2019, + url = {https://ctan.org/pkg/newcomputermodern} +} + +@misc{spdx, + title = {Software Package Data Exchange (SPDX)}, + author = {{Software Package Data Exchange}}, + year = 2010, + url = {https://spdx.org/} +} + +@misc{cyclonedx, + title = {CycloneDX}, + author = {{CycloneDX}}, + year = 2019, + url = {https://cyclonedx.org/} +} + +@misc{SWHArchive, + title = {Software Heritage Archive Website}, + author = {{Software Heritage Archive}}, + year = 2016, + url = {https://archive.softwareheritage.org/} +} + +@misc{DrupolNixCommitter, + title = {Nixpkgs committer grant}, + author = {Dellaiera, Pol}, + year = 2023, + url = {https://github.com/NixOS/nixpkgs/issues/50105#issuecomment-1571885173} +} + +@misc{NixpkgsCommitters, + title = {Nixpkgs committers}, + author = {{Nixpkgs}}, + url = {https://github.com/orgs/NixOS/teams/nixpkgs-committers} +} + +@misc{lix, + title = {Lix}, + author = {{Lix}}, + year = 2024, + url = {https://lix.systems/} +} + +@misc{githubSBOMPURL, + title = {Generated SBOM files will now include a package URL when a manifest file includes a range}, + author = {{GitHub}}, + year = 2024, + url = {https://github.blog/changelog/2024-06-11-generated-sbom-files-will-now-include-a-package-url-when-a-manifest-file-includes-a-range/} +} + +@misc{SoN200PolDellaiera, + title = {Summer Of Nix 2022 - State of Nix at European Commission}, + author = {Dellaiera, Pol}, + year = 2022, + url = {https://www.youtube.com/watch?v=I7wdcJ3YhoU} +} + +@misc{ArXiV, + title = {arXiv}, + author = {{arXiv}}, + year = 1991, + url = {https://arxiv.org/} +} + +@misc{aux, + title = {Aux}, + author = 
{{Aux}}, + year = 2024, + url = {https://aux.computer/} +} + +@book{HunterGCP, + title = "Google Cloud Platform for Developers", + author = "Hunter, Ted and Porter, Steven", + publisher = "Packt Publishing", + month = jul, + year = 2018, + address = "Birmingham, England" +} + +@inproceedings{Traugott2002, + author = {Traugott, Steve}, + year = {2002}, + month = {01}, + pages = {99-120}, + title = {Why Order Matters: Turing Equivalence in Automated Systems Administration.} +} + +@article{SurveyFlakyTests, + author = {Parry, Owain and Kapfhammer, Gregory M. and Hilton, Michael and McMinn, Phil}, + title = {A Survey of Flaky Tests}, + year = {2021}, + issue_date = {January 2022}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + volume = {31}, + number = {1}, + issn = {1049-331X}, + doi = {10.1145/3476105}, + abstract = {Tests that fail inconsistently, without changes to the code under test, are described as flaky. Flaky tests do not give a clear indication of the presence of software bugs and thus limit the reliability of the test suites that contain them. A recent survey of software developers found that 59\% claimed to deal with flaky tests on a monthly, weekly, or daily basis. As well as being detrimental to developers, flaky tests have also been shown to limit the applicability of useful techniques in software testing research. In general, one can think of flaky tests as being a threat to the validity of any methodology that assumes the outcome of a test only depends on the source code it covers. In this article, we systematically survey the body of literature relevant to flaky test research, amounting to 76 papers. We split our analysis into four parts: addressing the causes of flaky tests, their costs and consequences, detection strategies, and approaches for their mitigation and repair. 
Our findings and their implications have consequences for how the software-testing community deals with test flakiness, pertinent to practitioners and of interest to those wanting to familiarize themselves with the research area.}, + journal = {ACM Trans. Softw. Eng. Methodol.}, + month = {oct}, + articleno = {17}, + numpages = {74}, + keywords = {Flaky tests, software testing} +} + +@misc{barba2022definingroleopensource, + title = {Defining the role of open source software in research reproducibility}, + author = {Lorena A. Barba}, + year = {2022}, + eprint = {2204.12564}, + archivePrefix = {arXiv}, + primaryClass = {cs.CY}, + url = {https://arxiv.org/abs/2204.12564}, +} diff --git a/src/thesis/main.typ b/src/thesis/main.typ new file mode 100644 index 0000000..292f514 --- /dev/null +++ b/src/thesis/main.typ @@ -0,0 +1,49 @@ +#import "imports/preamble.typ": * +#import "theme/template.typ": * + +#set document( /* PDF metadata. `title` and `author` are not defined in this file; they must be provided by the two wildcard imports above. */ + title: title, + author: author, + date: none, + keywords: ( + "university", + "umons", + "june 2024", + "master thesis", + "reproducibility", + "r13y", + "compilation", + "docker", + "nix", + "guix", + ), +) + +#show: project.with( /* Applies the thesis template to everything that follows; each front-matter section is passed in as the content of an included file. */ + title: title, + university: university, + faculty: faculty, + degree: degree, + program: program, + supervisor: supervisor, + advisors: advisors, + author: author, + startDate: startDate, + submissionDate: submissionDate, + disclaimer: include "disclaimer.typ", + acknowledgement: include "acknowledgement.typ", + abstract: include "abstract.typ", + glossary: include "glossary.typ", + accessibility: include "accessibility.typ", + extra: include "extra.typ" +) + +#include "1-introduction.typ" + +#include "2-reproducibility.typ" + +#leftblank(weak: false) + +#include "3-tools.typ" + +#include "4-conclusion.typ" diff --git a/src/thesis/theme/UMONS-fs-logo.typ b/src/thesis/theme/UMONS-fs-logo.typ new file mode 100644 index 0000000..b4d1bf5 --- /dev/null +++ b/src/thesis/theme/UMONS-fs-logo.typ @@ -0,0 +1,15 @@ +#import 
"./common/metadata.typ": * +#import "./colors.typ": * + +#{ /* Faculty logo: the SVG image at 60pt height followed by a box holding a small grey "Faculty of" / "sciences" caption. */ + set text(font: "Liberation Sans") + set par(leading: 6pt) + box[#image("./UMONS_FS-logo.svg", height: 60pt)] + box[ + #v(.7em) + #text(size: .3em, fill: umons-grey)[ + Faculty of\ + sciences + ] + ] +} diff --git a/src/thesis/theme/UMONS-logo.svg b/src/thesis/theme/UMONS-logo.svg new file mode 100644 index 0000000..6644d21 --- /dev/null +++ b/src/thesis/theme/UMONS-logo.svg @@ -0,0 +1,73 @@ + + + + + + + + UMONS + University of Mons + + + diff --git a/src/thesis/theme/UMONS-logo.typ b/src/thesis/theme/UMONS-logo.typ new file mode 100644 index 0000000..7d2e58c --- /dev/null +++ b/src/thesis/theme/UMONS-logo.typ @@ -0,0 +1,21 @@ +#import "./common/metadata.typ": * +#import "./colors.typ": * + +#{ /* Text-only wordmark, right-aligned: an upper-cased grey "U" with a red underline followed by red "mons", then "University of Mons" in smaller grey text. */ + set text(font: "Liberation Sans") + set align(right) + set par(leading: 6pt) + { + set text(weight: 300) + upper[ + #text(fill: umons-grey)[#underline( + offset: 4pt, + stroke: umons-red, + )[U]]#text(fill: umons-red)[mons]\ + ] + } + { + set text(size: .35em, weight: 250) + text(fill: umons-grey)[University of Mons] + } +} diff --git a/src/thesis/theme/UMONS_FS-logo.svg b/src/thesis/theme/UMONS_FS-logo.svg new file mode 100644 index 0000000..44ef35d --- /dev/null +++ b/src/thesis/theme/UMONS_FS-logo.svg @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/thesis/theme/UMONS_FS.svg b/src/thesis/theme/UMONS_FS.svg new file mode 100644 index 0000000..559c76f --- /dev/null +++ b/src/thesis/theme/UMONS_FS.svg @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + Faculty of sciences + + diff --git a/src/thesis/theme/abstract.typ b/src/thesis/theme/abstract.typ new file mode 100644 index 0000000..6d1ff8e --- /dev/null +++ b/src/thesis/theme/abstract.typ @@ -0,0 +1,9 @@ +#import "common/metadata.typ": * + +#let abstract(title: "Abstract", body) = { /* Issues a weak page break, prints `title` as a level-1 heading that is excluded from the outline, then emits `body`. */ + pagebreak(weak: true) + + heading(level: 1, title, outlined: false) + + body +} diff --git a/src/thesis/theme/acknowledgement.typ 
b/src/thesis/theme/acknowledgement.typ new file mode 100644 index 0000000..7aab802 --- /dev/null +++ b/src/thesis/theme/acknowledgement.typ @@ -0,0 +1,9 @@ +#import "common/metadata.typ": * + +#let acknowledgement(title: "Acknowledgements", body) = { /* Issues a weak page break, prints `title` as a level-1 heading that is excluded from the outline, then emits `body`. */ + pagebreak(weak: true) + + heading(level: 1, title, outlined: false) + + body +} diff --git a/src/thesis/theme/colors.typ b/src/thesis/theme/colors.typ new file mode 100644 index 0000000..11441d6 --- /dev/null +++ b/src/thesis/theme/colors.typ @@ -0,0 +1,7 @@ +// Source: https://web.umons.ac.be/app/uploads/sites/8/2017/07/REF-COULEURS-FACS-UMONS.pdf +#let umons-red = rgb(168, 0, 57) +#let umons-turquoise = rgb(0, 171, 204) +#let umons-grey = rgb(150, 150, 150) +#let umons-yellow = rgb(246, 167, 35) + +#let umons-faculty-sciences = rgb(2, 150, 135) diff --git a/src/thesis/theme/common/metadata.typ b/src/thesis/theme/common/metadata.typ new file mode 100644 index 0000000..5d01b2f --- /dev/null +++ b/src/thesis/theme/common/metadata.typ @@ -0,0 +1,55 @@ +// Enter your thesis data here: +#let title = "Reproducibility in Software Engineering" +#let subtitle = none +#let university = "University of Mons" +#let faculty = "Faculty of Sciences" +#let degree = "Master" +#let program = "Computer Science" +#let view = degree + "’s Thesis in " + program +#let supervisor = [#link("https://orcid.org/0000-0003-3636-5020")[Prof. Dr. 
Tom Mens#box(image("../../../../resources/images/ORCIDiD_iconvector.svg", width: 10pt))]] +#let advisors = none +#let author = "Pol Dellaiera" +#let authorOrcId = "0009-0008-7972-7160" +#let startDate = "2023 - 2024" +#let submissionDate = "12 June 2024" +#let body-font = "New Computer Modern" +#let sans-font = "New Computer Modern Sans" +#let page-margin = (inside: 3cm, outside: 2cm, top: 20mm, bottom: 20mm) +#let rev = if "rev" in sys.inputs { /* rev, shortRev and builddate are read from sys.inputs and fall back to "" when the key is absent. */ + sys.inputs.rev +} else { + "" +} +#let shortRev = if "shortRev" in sys.inputs { + sys.inputs.shortRev +} else { + "" +} +#let builddate = if "builddate" in sys.inputs { + sys.inputs.builddate +} else { + "" +} + +// Default font sizes from original LaTeX style file. +#let font-defaults = ( + tiny: 6pt, + scriptsize: 7pt, + footnotesize: 9pt, + small: 9pt, + normalsize: 10pt, + large: 12pt, + Large: 14pt, + LARGE: 17pt, + huge: 20pt, + Huge: 25pt, +) + +#let font = ( + Large: font-defaults.Large + 0.4pt, // Actual font size. + footnote: font-defaults.footnotesize, + large: font-defaults.large, + small: font-defaults.small, + normal: font-defaults.normalsize, + script: font-defaults.scriptsize, +) diff --git a/src/thesis/theme/common/titlepage.typ b/src/thesis/theme/common/titlepage.typ new file mode 100644 index 0000000..7e096ed --- /dev/null +++ b/src/thesis/theme/common/titlepage.typ @@ -0,0 +1,105 @@ +#import "metadata.typ": * + +#let titlepage( + title: "", + subtitle: "", + university: "", + faculty: "", + degree: "", + program: "", + supervisor: "", + advisors: (), + author: "", + authorOrcId: "", + startDate: none, + submissionDate: none, + rev: none, + shortRev: none, + builddate: none, +) = { /* Lays out the title page: both logos in the page header, the faculty/university address in the footer, then the centred title, optional subtitle, `view` line and a three-column grid of labelled facts. `view`, `sans-font` and `body-font` are not parameters; they come from the metadata import above. */ + set page( + margin: (top: 1cm, left: 1cm, right: 1cm, bottom: 1cm), + numbering: none, + number-align: center, + header: { + place(top + left)[ + #pad(top: 10pt)[ + #set text(size: 4em) + #include "../UMONS-fs-logo.typ" + ] + ] + place(top + right)[ + #pad(top: 10pt)[ + #set text(size: 4em) + #include "../UMONS-logo.typ" + 
] + ] + }, + footer: align( + center, + text(font: sans-font)[ + #faculty #sym.diamond.filled.small + #university #sym.diamond.filled.small + 20, Place du Parc #sym.diamond.filled.small + B-7000 Mons + ], + ), + ) + + set text( + font: body-font, + size: 12pt, + lang: "en", + ) + + place(center + horizon, dy: 3em)[ + #align(center, text(font: sans-font, 2em, weight: 700, title)) + + #if subtitle != none { + align(center, text(font: sans-font, 1em, weight: 700, subtitle)) + } + + #align(center, text(font: sans-font, 1.3em, weight: 100, view)) + + #grid( + columns: 3, + gutter: 1em, + align: (right, left, left), + strong("Author"), + ":", + { + link("https://orcid.org/" + authorOrcId)[#author#box( + image( + "../../../../resources/images/ORCIDiD_iconvector.svg", + width: 10pt, + ), + )] + }, + strong("Supervisor"), + ":", + supervisor, + ..if advisors != none { + (strong("Advisors"), ":", advisors.join(", ")) + }, + ..if startDate != none { + (strong("Academic year"), ":", startDate) + }, + ..if submissionDate != none { + (strong("Submission date"), ":", submissionDate) + }, + ..if builddate not in (none, "") { /* Skip the row for the "" fallback and for this function's own `none` default. */ + (strong("Build date"), ":", builddate) + }, + ..if shortRev not in (none, "") and rev != none { /* The link target is built from the full `rev`, so a `none` rev must not reach the string concatenation below. */ + ( + strong("Revision"), + ":", + link( + "https://github.com/drupol/master-thesis/commit/" + rev, + shortRev, + ), + ) + }, + ) + ] +} diff --git a/src/thesis/theme/definition.typ b/src/thesis/theme/definition.typ new file mode 100644 index 0000000..a8204ea --- /dev/null +++ b/src/thesis/theme/definition.typ @@ -0,0 +1,35 @@ +#import "infos.typ": * +#import "common/metadata.typ": * + +#let definition(term: none, name: none, content) = { /* Wraps `content` in a numbered figure of kind "definition" captioned with `term`, rendered through info-box, and attaches the label `name`; when `name` is none the label is "def-" + term. NOTE(review): with both `term` and `name` left at none the concatenation has no string to append; confirm callers always pass one of them. */ + let kind = "Definition" + let supplement = upper(kind.first()) + lower(kind.slice(1)) + let name = if name == none { + "def-" + term + } else { + name + } + + show figure.where(kind: "definition"): it => info-box( + kind: "definition", + settings: (prefix: [#smallcaps[#it.caption]]), + )[ + #it.body + ] + show figure.caption.where(kind: "definition"): it => block( + 
width: 100%, + { + it + }, + ) + + [ + #figure( + kind: "definition", + supplement: [#supplement], + caption: term, + numbering: "1", + content, + ) #label(name) + ] +} diff --git a/src/thesis/theme/disclaimer.typ b/src/thesis/theme/disclaimer.typ new file mode 100644 index 0000000..6879d8a --- /dev/null +++ b/src/thesis/theme/disclaimer.typ @@ -0,0 +1,25 @@ +#import "common/metadata.typ": * + +#let disclaimer( + title: "", + degree: "", + author: "", + submissionDate: none, + signature: none, +) = { /* Prints the authorship statement 80% down a fresh page, then a two-column row: "Mons, " + submissionDate on the left, `author` and `signature` right-aligned on the right. `title` is accepted but not used in this body. */ + pagebreak(weak: true) + + v(80%) + text("I confirm that this " + degree + "’s thesis is my own work and I have documented all sources and material used.") + + v(15mm) + grid( + columns: 2, + gutter: 1fr, + "Mons, " + submissionDate, + { + set align(right) + author + signature + }, + ) +} diff --git a/src/thesis/theme/glossary.typ b/src/thesis/theme/glossary.typ new file mode 100644 index 0000000..4cb936c --- /dev/null +++ b/src/thesis/theme/glossary.typ @@ -0,0 +1,20 @@ +#import "../imports/preamble.typ": * +#import "common/metadata.typ": * + +#let glossary( + title: "Glossary", + terms: (), +) = { /* Weak page break, an unoutlined level-1 heading, a 10mm gap, then print-glossary over `terms` with show-all enabled. print-glossary is not defined in this file; presumably it arrives through the preamble import. */ + pagebreak(weak: true) + + [ + #{ + heading(level: 1, title, outlined: false) + } + ] + + + v(10mm) + + print-glossary(show-all: true, terms) +} diff --git a/src/thesis/theme/infos.typ b/src/thesis/theme/infos.typ new file mode 100644 index 0000000..c783c03 --- /dev/null +++ b/src/thesis/theme/infos.typ @@ -0,0 +1,147 @@ +#import "./colors.typ": * + +#let info-settings = ( /* Per-kind presentation settings for info boxes: optional prefix content, icon base name, fill colour and stroke colour. */ + info: ( + prefix: none, + icon: "circle-info", + fill_color: umons-turquoise.lighten(90%), + stroke_color: umons-turquoise, + ), + cite: ( + prefix: none, + icon: "quote-left", + fill_color: rgb("#ffffff"), + stroke_color: black, + ), + definition: ( + prefix: [#underline(smallcaps[*Definition*])], + icon: "highlighter-solid", + fill_color: umons-faculty-sciences.lighten(90%), + stroke_color: umons-faculty-sciences, + ), + question: ( + prefix: none, + icon: "circle-question", + fill_color: 
umons-yellow.lighten(90%), + stroke_color: umons-yellow, + ), + important: ( + prefix: none, + icon: "circle-exclamation", + fill_color: rgb("#228B22").lighten(90%), + stroke_color: rgb("#228B22").darken(20%), + ), + conclusion: ( + prefix: none, + icon: "lightbulb-solid", + fill_color: umons-red.lighten(90%), + stroke_color: umons-red, + ), + good: ( + prefix: none, + icon: "circle-check", + fill_color: umons-grey.lighten(90%), + stroke_color: umons-grey.darken(20%), + ), + note: ( + prefix: [ *Note:* ], + icon: "note-sticky", + fill_color: umons-grey.lighten(90%), + stroke_color: umons-grey.darken(20%), + ), +); + +#let info-stroke(kind: "good") = info-settings.at(kind).stroke_color + +#let info-image(kind: "info", ..args) = { /* Loads "solid/<icon>.svg" for the given kind, forwarding extra arguments to image. NOTE(review): info-box reads its icons from "../../../resources/images/" instead; confirm which path is intended. */ + let settings = info-settings.at(kind) + image("solid/" + settings.icon + ".svg", ..args, alt: settings.icon) +} + +#let info-box( + body, + settings: (:), + kind: "info", + radius: 5pt, + footer: none, + icon: true, + ref: none, +) = { /* Renders `body` in a rounded, filled box styled by `kind`. The `settings` argument is added on top of the entry looked up in info-settings; `footer`, when given, follows the body after a flexible gap in smaller text; for kind "cite" the body is wrapped in an emphasised quote; when `ref` is given it is turned into a label placed after the inner figure. */ + set par( + leading: 0.55em, + justify: true, + ) + + let settings = info-settings.at(kind) + settings + let extra = if footer == none { + none + } else { + v(.5em) + h(1fr) + text(size: .75em)[#footer] + } + + set align(left) + + box( + width: 0.8fr, + fill: settings.fill_color, + stroke: .5pt + settings.stroke_color, + radius: radius, + inset: 0pt, + )[ + #let body = if kind == "cite" { + quote(attribution: extra, quotes: false)[#emph(body)] + } else { + body + } + + #let contents = if icon { + ( + image( + "../../../resources/images/" + settings.icon + ".svg", + width: 32pt, + ), + ( + { + settings.prefix + body + extra + } + ), + ) + } else { + ( /* The trailing comma below makes this a one-element array; `contents` is spread into the table call further down, so it has to be an array in both branches. */ + { + settings.prefix + body + extra + }, + ) + } + #figure( + kind: "info-box", + supplement: [Info box], + { + set align(left) + table( + columns: if icon { + (38pt, 1fr) + } else { + 1 + }, + inset: 10pt, + stroke: none, + column-gutter: 10pt, + ..contents, + ) + }, + ) #{ + if ref != none { + label(ref) + } else { + none + } + } + ] +} diff --git 
a/src/thesis/theme/leftblank.typ b/src/thesis/theme/leftblank.typ new file mode 100644 index 0000000..760d672 --- /dev/null +++ b/src/thesis/theme/leftblank.typ @@ -0,0 +1,17 @@ +#import "common/metadata.typ": * + +#let leftblank( + weak: true +) = { /* Emits a page break (weak or not, per `weak`), a centred light-grey "This page is intentionally left blank." notice 80% down the page, then a weak page break. */ + pagebreak(weak: weak) + + set align(center) + + v(80%) + + text(fill: black.lighten(75%))[ + This page is intentionally left blank. + ] + + pagebreak(weak: true) +} diff --git a/src/thesis/theme/template.typ b/src/thesis/theme/template.typ new file mode 100644 index 0000000..6d534e3 --- /dev/null +++ b/src/thesis/theme/template.typ @@ -0,0 +1,306 @@ +#import "../imports/preamble.typ": * +#import "common/metadata.typ": * +#import "common/titlepage.typ": * +#import "disclaimer.typ": * +#import "leftblank.typ": * +#import "acknowledgement.typ": * +#import "abstract.typ": * +#import selectors: * + +#let getHeader() = { /* Running header: the current numbered chapter in small caps and the current section (level 2 and deeper) emphasised, separated by a flexible gap; the order is reversed on even pages, and a thin rule is drawn below only when a chapter was found. hydra and selectors are not defined in this file. */ + locate(loc => { + let page-counter = counter(page) + let current = page-counter.at(loc).first() + + let chapter = hydra( + 1, + display: (_, it) => { + if it.numbering != none { + [#numbering(it.numbering, ..counter(heading).at(it.location())) - #it.body] + } + }, + ) + + let section = hydra( + selectors.by-level(min: 2), + display: (_context, element) => element.body, + ) + let items = (smallcaps(chapter), h(1fr), emph(section)) + + if calc.even(current) { + items.rev() + } else { + items + }.join() + + if (chapter != none) { + [#line(length: 100%, stroke: .2pt + rgb("#000000").lighten(65%))] + } + }) +} + +#let getFooter() = { /* Running footer: the page number and the emphasised thesis `title`; the page number comes first on even pages and last on odd pages. */ + locate(loc => { + let page-counter = counter(page) + let current = page-counter.at(loc).first() + let items = ([#current], h(1fr), emph(title)) + + if calc.even(current) { + items + } else { + items.rev() + }.join() + }) +} + +#let chapterquote( + title: none, + ref: none, + quoteText: none, + quoteAttribution: none, +) = { /* Chapter opener between two page breaks: two overlapping squares in the top-left corner, a level-1 heading labelled `ref`, and, when quoteText is given, a block quote whose attribution is a prose citation of quoteAttribution. NOTE(review): `ref` defaults to none but is passed to label() unconditionally; confirm every caller supplies it. */ + pagebreak() + + place(top + left, dx: 45pt, dy:45pt)[ + #rect(width: 50pt, height: 50pt, fill: rgb(125, 125, 125)) + ] + + place(top + left)[ + 
#rect(width: 70pt, height: 70pt, fill: rgb(0, 0, 0)) + ] + + v(10%) + + [ + #{ + heading(title, level: 1) + } + #label(ref) + ] + + if quoteText != none { + show quote: set pad(x: 0em) + quote( + block: true, + attribution: [#{ + if quoteAttribution != none { + cite(form: "prose", quoteAttribution) + } + }], + quoteText, + ) + } + + pagebreak() +} + +#let project( + title: "", + university: "", + faculty: "", + degree: "", + program: "", + supervisor: "", + advisors: (), + author: "", + startDate: none, + submissionDate: none, + disclaimer: none, + acknowledgement: none, + abstract: none, + glossary: none, + accessibility: none, + extra: none, + body, +) = { /* Template entry point: page setup, title page, global text/paragraph/reference/citation/link rules, front matter separated by blank pages, table of contents, `body` with heading and figure styling, then the lists of definitions, figures and tables and the bibliography. `subtitle`, `authorOrcId`, `rev`, `shortRev` and `builddate` are not parameters; they are module-level names (see the imports at the top of this file). */ + // --- Page configuration --- + set page( + margin: page-margin, + numbering: "1", + number-align: center, + header: getHeader(), + footer: getFooter(), + paper: "a4", + ) + + titlepage( + title: title, + subtitle: subtitle, + university: university, + faculty: faculty, + degree: degree, + program: program, + supervisor: supervisor, + advisors: advisors, + author: author, + authorOrcId: authorOrcId, + startDate: startDate, + submissionDate: submissionDate, + rev: rev, + shortRev: shortRev, + builddate: builddate, + ) + + // --- Typography --- + set text( + font: body-font, + size: font.normal, + lang: "en", + hyphenate: false, + ) + + // --- Paragraphs --- + // Source: https://typst.app/docs/guides/guide-for-latex-users/ + set par(justify: true) + show par: set block(spacing: 1em) + + show ref: it => { /* Custom reference text: headings of level 1 to 4 are rendered as "Chapter N", optionally followed by the section numbers; unresolved references are returned unchanged and every other target keeps its default text wrapped in a link. */ + let el = it.element + + if el == none { + return it + } + + if el.has("level") and el.level == 1 { + let (chapter,) = counter(heading).at(el.label) + link(el.label)[Chapter #chapter] + } else if el.has("level") and el.level == 2 { + let (chapter, section) = counter(heading).at(el.label) + link(el.label)[Chapter #chapter, section #section] + } else if el.has("level") and el.level == 3 { + let (chapter, section, subsection) = counter(heading).at(el.label) + link(el.label)[Chapter #chapter, section 
#section.#subsection] + } else if el.has("level") and el.level == 4 { + let ( + chapter, + section, + subsection, + subsubsection, + ) = counter(heading).at(el.label) + link(el.label)[Chapter #chapter, subsection #section.#subsection.#subsubsection] + } else { + link(el.label)[#it] + } + } + + // --- Citations --- + set cite( + form: "prose", + style: "ieee", + ) + show cite: cite => { + underline(cite, stroke: .2pt + rgb("#000000").lighten(65%)) + } + + // --- Links --- + show link: it => { + underline(it, stroke: .2pt + rgb("#000000").lighten(65%)) + } + + /* Front matter. The names below are this function's parameters (caller-supplied content), not the same-named functions brought in by the imports at the top of the file. */ leftblank(weak: false) + [#disclaimer] + leftblank(weak: false) + [#abstract] + leftblank(weak: false) + [#acknowledgement] + leftblank(weak: false) + [#accessibility] + [#extra] + [#glossary] + + leftblank(weak: false) + + { + // --- Table of Contents --- + { + set par( + leading: 0.45em, + justify: true, + ) + show outline.entry.where(level: 1): it => { + v(12pt, weak: true) + strong(it) + } + + heading(numbering: none, outlined: false)[Contents] + outline(title: "", indent: 1.5em, depth: 3) + } + + leftblank(weak: false) + + // --- Headings --- + set heading(numbering: "1.") + show heading.where(level: 1): set heading( + numbering: "I", + supplement: [Chapter], + ) + show heading.where(level: 2): set heading( + numbering: "1.", + supplement: [Section], + ) + show heading.where(level: 3): set heading( + numbering: "1.", + supplement: [Subsection], + ) + show heading.where(level: 1): it => block({ + set text( + 2em, + weight: "bold", + ) + v(8em) + if it.numbering != none [ + Chapter #numbering(it.numbering, ..counter(heading).at(it.location())) + #v(.5em) + ] + it.body + v(0.5em) + }) + + // --- Various outlines --- + show outline.entry.where(level: 1): it => { + v(1em, weak: true) + it + } + + // --- Raw text configuration --- + show raw.line: set text(font: "Inconsolata Nerd Font Mono") + + // --- Equations --- + show math.equation: set text(weight: 400) + + // --- Figures --- + show figure.caption: it => 
[*#it.supplement #it.counter.display()*: #it.body] + show figure.where(kind: "table"): set figure.caption(position: top) + body + } + + { /* Back matter: outlines of the "definition", image and "table" figures, each followed by a blank page, then the bibliography loaded from ../literature.bib with full: true. */ + set par( + leading: 1em, + justify: true, + ) + + // List of definitions. + [#heading(numbering: none)[List of definitions] #label("list-of-definitions")] + outline(title: "", target: figure.where(kind: "definition")) + + leftblank(weak: false) + + // List of figures. + pagebreak() + [#heading(numbering: none)[List of figures] #label("list-of-figures")] + outline(title: "", target: figure.where(kind: image)) + + leftblank(weak: false) + + // List of tables. + pagebreak() + [#heading(numbering: none)[List of tables] #label("list-of-tables")] + outline(title: "", target: figure.where(kind: "table")) + + leftblank(weak: false) + + [#heading("Bibliography", level: 1, outlined: true) #label("bibliography")] + bibliography("../literature.bib", full: true, style: "ieee", title: none) + } +}