From 7bfe0e42c628265d69129e1c931a602bb78b15fd Mon Sep 17 00:00:00 2001 From: Pol Dellaiera Date: Wed, 11 Sep 2024 10:58:14 +0200 Subject: [PATCH] work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress work in progress remove environment section --- .../configuration-management-summary.typ | 32 +- resources/typst/configuration-management.typ | 40 +- src/thesis/2-reproducibility.typ | 391 ++++++++++-------- src/thesis/literature.bib | 14 + 4 files changed, 268 insertions(+), 209 deletions(-) diff --git a/resources/typst/configuration-management-summary.typ b/resources/typst/configuration-management-summary.typ index 3f33dd3..d2bf062 100644 --- a/resources/typst/configuration-management-summary.typ +++ b/resources/typst/configuration-management-summary.typ @@ -1,5 +1,6 @@ #import "../../src/thesis/imports/preamble.typ": * +#set align(left) #table( columns: (1fr, 1fr, 1fr), @@ -7,40 +8,35 @@ table.header( [], table.vline(stroke: 1pt), - [#align(center)[Imperative]], + [#align(center)[*Imperative*]], table.vline(stroke: .5pt), - [#align(center)[Declarative]], + [#align(center)[*Declarative*]], table.hline(stroke: 1pt), ), - table.cell(align: horizon + center)[Divergent], + table.cell(align: horizon + center)[*Divergent*], + table.hline(stroke: .5pt), [ - Shell commands ], [ - Shell scripts - - Ansible ], + table.cell(align: horizon + center)[*Convergent*], table.hline(stroke: .5pt), - table.cell(align: horizon + center, rowspan: 2)[Convergent], - table.cell(colspan: 2)[ - - Docker - ], - 
table.hline(stroke: .5pt + luma(200), start: 1), [ - - Ansible - - Chef - - Shell scripts + - Ansible #cite(,form:"normal") + - Chef #cite(,form:"normal") + - Docker #cite(,form:"normal") ], [ - - Puppet - - Kubernetes - - Terraform + - Puppet #cite(,form:"normal") + - Terraform #cite(,form:"normal") ], table.hline(stroke: .5pt), - table.cell(align: horizon + center)[Congruent], + table.cell(align: horizon + center)[*Congruent*], [], [ - - Nix - - Guix + - Guix #cite(,form:"normal") + - Nix #cite(,form:"normal") ], ) diff --git a/resources/typst/configuration-management.typ b/resources/typst/configuration-management.typ index 4b8de6c..ccf06e6 100644 --- a/resources/typst/configuration-management.typ +++ b/resources/typst/configuration-management.typ @@ -10,13 +10,13 @@ set text(font: "Virgil 3 YOFF") cetz.canvas({ import cetz.plot - import cetz.draw: content + cetz.draw.set-style(axes: (shared-zero: false)) plot.plot( - size: (3.5, 3), - y-label: [state], - x-label: [Time], + size: (3, 3), axis-style: "school-book", + y-label: [State], + x-label: [Time], x-tick-step: none, y-tick-step: none, x-min: 0, @@ -24,16 +24,17 @@ x-grid: true, y-min: 0, y-max: 500, - legend: "legend.north", { plot.add( - ((75, 75), (450, 300)), + ((75, 75), (450, 500)), mark: "o", + mark-style: (stroke: blue, fill: white), mark-size: .1, ) plot.add( ((75, 50), (450, 125)), mark: "o", style: (stroke: (paint: red, dash: "dashed")), + mark-style: (stroke: red, fill: white), mark-size: .1, ) }, ) @@ -49,13 +50,17 @@ set text(font: "Virgil 3 YOFF") cetz.canvas({ import cetz.plot - import cetz.draw: * + cetz.draw.set-style(axes: (shared-zero: false)) plot.plot( - size: (3.5, 3), + size: (3, 3), y-label: [State], x-label: [Time], axis-style: "school-book", + legend-style: ( + default-position: "legend.north", + stroke: none, + ), x-tick-step: none, y-tick-step: none, x-min: 0, @@ -63,19 +68,20 @@ x-grid: true, y-min: 0, y-max: 500, - legend: "legend.north", { plot.add( - ((75, 75), (450, 300)), - 
style: (stroke: (paint: blue)), + ((75, 125), (450, 300)), + style: (stroke: (paint: red, dash: "dashed")), + mark-style: (stroke: red, fill: white), mark-size: .1, mark: "o", - label: "actual", + label: "Target", ) plot.add( ((75, 500), (450, 325)), + style: (stroke: (paint: blue), mark: (fill: blue, stroke: blue)), + mark-style: (stroke: blue, fill: white), mark-size: .1, mark: "o", - label: "target", - style: (stroke: (paint: red, dash: "dashed")), + label: "Actual", ) }, ) @@ -91,10 +97,10 @@ set text(font: "Virgil 3 YOFF") cetz.canvas({ import cetz.plot - import cetz.draw: * + cetz.draw.set-style(axes: (shared-zero: false)) plot.plot( - size: (3.5, 3), + size: (3, 3), y-label: [State], x-label: [Time], axis-style: "school-book", @@ -110,11 +116,13 @@ plot.add( ((75, 75), (450, 300)), mark: "o", + mark-style: (stroke: blue, fill: white), mark-size: .1, ) plot.add( ((75, 50), (450, 275)), mark: "o", style: (stroke: (paint: red, dash: "dashed")), + mark-style: (stroke: red, fill: white), mark-size: .1, ) }, ) diff --git a/src/thesis/2-reproducibility.typ b/src/thesis/2-reproducibility.typ index 39e5d2a..2a12eee 100644 --- a/src/thesis/2-reproducibility.typ +++ b/src/thesis/2-reproducibility.typ @@ -301,28 +301,32 @@ reproducibility. #definition( name: "def-reproducibility-build-time", - term: "Reproducibility at build time", + term: "Reproducibility at build-time", )[ - Reproducibility at build time refers to the ability to consistently generate - the same executable or software artefact from a given source code across - different builds on different environments, across different space and time. - This aspect is crucial in ensuring that the software compilation process is - deterministic and immune to variances in development environments, compiler - versions, or build tools. 
It involves a meticulous standardisation and - documentation of the build environment and dependencies to guarantee that the - same executable is produced regardless of when or where the build occurs. + Reproducibility at build-time refers to the ability to produce an identical + build artefact #eg[a binary executable, library or container image] + consistently across multiple build attempts, environments (space), or points + in time. This means that given the same source code and build instructions, + the build process yields the same output every time, regardless of where or + when the build occurs. Achieving build-time reproducibility requires + controlling all aspects of the build environment, including dependency + versions, build tools, and any external resources, to eliminate variability + and non-determinism in the build process. ] #definition( name: "def-reproducibility-run-time", - term: "Reproducibility at run time", + term: "Reproducibility at run-time", )[ - Reproducibility at run time addresses the consistency of software behaviour - and output when the software is executed in different environments or under - varying conditions. This type of reproducibility focuses on ensuring that the - software performs identically and produces the same results regardless of the - #gls("OS"), underlying hardware, or external dependencies it interacts - with during execution. + Reproducibility at run-time refers to the ability of a software system or + application to behave consistently and produce the same results each time it + is executed, given the same inputs and environment. This means that when the + software is run in different environments or at different times, it performs + identically, providing the same functionality and output. 
Achieving run-time + reproducibility involves ensuring that the execution environment is controlled + and consistent, including the #gls("OS"), hardware configuration, environment + variables, and any external dependencies or services the software interacts + with. ] To illustrate these phases, the C source code in @montecarlo-pi.c implements the @@ -1468,10 +1472,204 @@ and at any point in the past or future​​​​. feasible to replicate the same output in a different environment, within the same architecture, achieving exact temporal replication of the build process is practically impossible. This temporal variability serves as a critical - indicator of potential difficulties in ensuring reproducibility across diverse - environments or machines. + indicator of potential difficulties in ensuring reproducibility across a + variety of environments or machines. ] +==== Configuration Management + +Reproducibility relies on stable, consistent and well-maintained codebases but +also heavily depends on stable, consistent and well-maintained environments as +seen in (add ref to ch2-environments). In addition, a critical component is +environment configuration management. Configuration management plays a critical +role in ensuring reproducibility by mitigating the non-deterministic behaviours +introduced by configuration drift. + +#info-box[ + Configuration drift occurs when changes to an environment + accumulate over time, leading to variations that deviate from the desired or + initial configuration state, thus introducing non-determinism. +] + +This section examines key configuration management models and paradigms, +their impact on reproducibility, and the tools that enforce these principles in +modern software environments. + +Another source of non-determinism potentially arises from inconsistent +environment configurations. The way environments are managed directly affects +the environment's behaviour and, inherently, reproducibility.
Therefore, +configuration management plays an important role in mitigating non-determinism +by ensuring that environments #eg[software installations or software builds] +remain consistent in space and time. + +@Traugott2002 classify environment configuration management into three +categories, each of which has a distinct impact on the level of determinism +achieved: + +#figure(include "../../resources/typst/configuration-management.typ") + +===== Divergent Configuration Management + +In this model (@divergent-config-management), environments are typically managed +by one or more individuals, which inevitably leads to configuration drift, +where the configurations deviate over time. This is an unavoidable process when +system modifications are performed without centralised control, leading to +unpredictable and non-deterministic behaviour, making reproducibility almost +impossible. Reducing reliance on manual adjustments is essential to achieving +higher levels of system predictability and reproducibility. This challenge can +become particularly problematic in self-modifying environments, potentially +leading to circular dependency issues. A common example of such an +environment is a newly installed operating system that initially shares a +uniform configuration. Over time, as users customise their environments by +running shell commands or scripts to suit individual preferences, the system’s +state diverges from its original, well-defined configuration. + +===== Convergent Configuration Management + +Once configuration drift is identified as an issue, the focus shifts towards +convergence, bringing environments back to a known and consistent state, as +illustrated in @convergent-config-management. While efforts are made to +standardise configurations, achieving exact uniformity is extremely challenging, +if not impossible.
Environments may progressively "converge" towards a common +state, but subtle differences can persist, introducing variability. To +illustrate this model, we could think of an arbitrary environment that needs to +be configured in a specific way to reach a particular, well-known state. For +example, some specific dependencies have to be installed to run a particular +service. Tools like Puppet #cite(, form: "normal"), Chef +#cite(, form: "normal"), Terraform #cite(,form: "normal") +and Ansible #cite(, form: "normal") might help to achieve this goal. + +While convergent management offers flexibility in responding to unforeseen +changes in the environment, it is prone to feedback loops that may cause +unexpected behaviour. Such feedback loops make it difficult to +achieve complete reproducibility, as the system's progression towards the +desired state is not guaranteed to follow a deterministic path. + +#info-box[ + Feedback loops #cite(, form: "normal") + refer to a situation where the system continuously reacts to its own changes + or state modifications, often in an unintended or uncontrollable way. + Specifically, feedback loops occur when the system detects deviations from its + desired state and repeatedly attempts to correct them. However, each + corrective action may introduce new changes that the system tries to react to + again, leading to a continuous loop of corrections and adjustments. This + behaviour can be problematic because it may cause the system to never fully + stabilise or reach its intended state. Instead, the system keeps cycling + through adjustments based on previous changes, which can result in + unpredictable outcomes. +] + +===== Congruent Configuration Management + +This approach in @congruent-config-management enforces strict consistency across +all environments, ensuring that each environment maintains an identical +configuration.
By preventing configuration drift from the outset, congruent +configuration management aims to eliminate one of the key sources of +non-determinism. Maintaining identical setups across environments is a central +goal of this model, providing the highest level of determinism and reliability +in system behaviours. + +Congruent management, particularly through the adoption of immutable +environments ((add ref to ch2-environments)), ensures that environments remain in a +well-defined state, thus maximising reproducibility. However, this approach can +lack the flexibility required for dynamic environments, where each minor +adjustment may necessitate rebuilding the entire system. This limitation +highlights the importance of carefully choosing between convergent and congruent +approaches based on the environment's needs. + +#info-box[ + Immutable environments ((add ref to ch2-environments)) are environments that are designed + to be unchangeable once they are created. They are often used in containers + #eg[Docker #cite(,form:"normal")], where the ability to quickly create + and destroy environments is essential. Immutable environments enhance + reproducibility and reliability, making them an ideal choice for environments + that require high levels of predictability and stability. +] + +Tools such as Guix or Nix have demonstrated that it is possible to achieve a +high degree of congruence while allowing controlled divergence in specific areas +such as databases, logs or secret management. This balance highlights the +flexibility required to maintain reproducibility in environments that manage +both static system components and dynamic data. + +On top of specifying configuration management models, we can also distinguish +two different configuration management paradigms.
+ +#figure( + include "../../resources/typst/configuration-management-summary.typ", + caption: [Configuration Management Models and Paradigms], + kind: "table", + supplement: [Table], +) + +===== Imperative Configuration Management + +This paradigm specifies the exact steps required to transition an environment +from its current state to the desired state. Tools such as +Ansible #cite(, form: "normal"), Chef #cite(, form: "normal"), +Docker #cite(, form: "normal"), and shell commands exemplify this +paradigm. While imperative configurations enable the use of complex logic and +conditional operations, they can be challenging to maintain due to their +non-idempotent nature, meaning the same script may yield different results +depending on the environment's initial state. + +The expressiveness of imperative tools allows for stronger assumptions about the +environment's current state, which increases the likelihood of configuration +drift as environments diverge over time. Ensuring consistency in an imperative +approach often demands extensive error handling, validation checks, and retries +to guarantee that, despite the sequential nature of the process, the system +ultimately reaches a stable state. + +This approach requires careful management to ensure consistency, providing +detailed control at the expense of simplicity and predictability. Some +imperative tools can achieve a level of congruence, but this often comes at the +cost of predictability and ease of maintenance, making them less suitable in +environments where stability and simplicity are prioritised. + +===== Declarative Configuration Management + +Declarative configuration management ensures idempotence, meaning the same +configuration can be applied multiple times without altering the environment +beyond its intended state. This abstraction simplifies understanding and +maintenance by allowing the system to determine the necessary actions to achieve +the desired state.
Tools such as Puppet #cite(, form: "normal"), +Kubernetes #cite(,form: "normal"), +Terraform #cite(,form: "normal") and, under some conditions, +Docker #cite(, form: "normal") are used to specify the desired end +state. These tools typically feature their own specific #gls("DSL") to create +high-level descriptions of the expected environment's state, as opposed to +issuing imperative and procedural commands. The declarative approach mitigates +the risk of configuration drift by prioritising idempotence, maintaining +explicit dependency graphs, and ensuring a strong awareness of the current state +of the environment #cite(,form:"normal", supplement: [p. 348]). + +#info-box(kind: "note")[ + In @ch2-table-configuration-mgmt, Docker #cite(, form: "normal") and + Ansible #cite(, form: "normal") are classified as both declarative + and imperative. This dual classification arises from the fact that while + they often start with declarative configurations + #eg[a `Dockerfile`, a `playbook`], they can shift towards an imperative + approach when shell commands are introduced within those files to achieve + the desired state. As a result, for Docker, the same `Dockerfile` may produce + different outcomes depending on the base image in use. Similarly, Ansible + might behave differently depending on the current state of the machine it + runs on. This dual nature can lead to non-idempotent behaviour, hindering + reproducibility. + + Classifying these tools into distinct categories is far from trivial. + Some tools feature comprehensive #gls("DSL") that are agnostic of the + underlying #gls("OS") and architecture, functioning independently without the + need for additional dependencies. In contrast, other tools rely on external + technologies, further complicating the distinction between imperative and + declarative configuration management. The boundaries between these paradigms + are often blurred, as it's rarely a matter of black and white.
For instance, + while Nix and Guix are primarily categorised as declarative, they occasionally + rely on imperative languages #eg[shell scripts, Python] to perform specific + tasks. This illustrates that even tools labelled as declarative can integrate + aspects of imperative configuration management, adding nuance to their + classification. +] + === Sources Of Non-Determinism In this section we will explore the sources of non-determinism in software @@ -1718,163 +1916,6 @@ as `-u`, and the `LC_ALL` environment variable to the `date` command. This approach ensures that the output we receive is predictable and consistent, regardless of the underlying system configuration. -==== Environments and Configuration Management - -In the context of #gls("SE"), reproducibility not only relies on stable -codebases but also heavily depends on consistent and well-maintained -environments. Configuration management plays a critical role in ensuring -reproducibility by mitigating the non-deterministic behaviours introduced by -configuration drift. - -#info-box[ -Configuration drift occurs when changes to an environment -accumulate over time, leading to variations that deviate from the desired or -initial configuration state, thus introducing non-determinism. -] - -This section examines key configuration management models, -their impact on reproducibility, and the tools that enforce these principles in -modern software environments. - -Another source of non-determinism arises from inconsistent environment -configurations. The way environments are managed directly affects the -environment behaviours and inherently, reproducibility. Therefore, configuration -management plays an important role in mitigating non-determinism by ensuring -that systems, software installations and software builds remain consistent -across different environments. 
- -@Traugott2002 classify environment configuration management into three -categories, each of which has a distinct impact on the level of determinism -achieved: - -#figure(include "../../resources/typst/configuration-management.typ") - -===== Divergent Configuration Management - -In this model (@divergent-config-management), environments are typically managed -by one or more individuals, which inevitably leads to -#emph[configuration drift], where the configurations of different systems -deviate over time. This is an unavoidable process when system modifications are -performed without centralised control, leading to unpredictable and -non-deterministic behaviour, making reproducibility almost impossible in complex -infrastructures. Reducing reliance on manual adjustments is essential to -achieving higher levels of system predictability and reproducibility. -A common example of this model is a newly installed operating system that -initially shares a uniform configuration. Over time, as users customise their -environments to suit individual preferences, the system’s state diverges from -its original, well-defined configuration. - -===== Convergent Configuration Management - -Once configuration drift is identified as an issue, the focus shifts towards -convergence, bringing systems back to a known and consistent state, as -illustrated in @convergent-config-management. While efforts are made to -standardise configurations, achieving exact uniformity is extremely challenging, -if not impossible. Systems may progressively "converge" towards a common -configuration, but subtle differences can persist, introducing variability. The -goal in this model is to minimise these variations as much as possible, though -complete uniformity is rarely attained. To illustrate this model, we could think -of an arbitrary environment that needs to be configured in a specific way, reach -a particular well known state. For example, some specific dependencies has to be -installed. 
Tools like Puppet #cite(, form: "normal"), -Kubernetes #cite(,form: "normal"), -Terraform #cite(,form: "normal"), -Ansible #cite(, form: "normal"). -While convergent management offers flexibility in responding to unforeseen -changes in the environment, it is prone to feedback loops that may cause -unexpected behaviour​. Such feedback loops make it difficult to achieve complete -reproducibility, as the system's progression towards the desired state is not -guaranteed to follow a deterministic path. - -===== Congruent Configuration Management - -This approach in @congruent-config-management enforces strict consistency across -all environments, ensuring that each environment maintains an identical -configuration. By preventing configuration drift from the outset, congruent -configuration management aims to eliminate one of the key sources of -non-determinism. Maintaining identical setups across environments is a central -goal of this model, providing the highest level of determinism and reliability -in system behaviours. To illustrate this model, we could think of an arbitrary -environment that needs to be configured in a specific way. - -Congruent management, particularly through the adoption of immutable -environment, ensures that systems remain in a well-defined state, thus -maximising reproducibility. However, this approach can lack the flexibility -required for dynamic environments, where each minor adjustments may necessitate -rebuilding the entire system. This limitation highlights the importance of -carefully choosing between convergent and congruent approaches based on the -system's needs. - -Tools such as Nix or Guix have demonstrated that it is possible to achieve a -high degree of congruence while allowing controlled divergence in specific areas -such as databases or secret management​. 
This balance between convergence and -congruence highlights the flexibility required to maintain reproducibility in -environments that manage both static system components and dynamic data. - -On top of specifying configuration management models, we can also distinguish -two different configuration management paradigms. - -===== Imperative Configuration Management - -This paradigm specifies the exact steps required to transition an environment -from its current state to the desired state. Tools such as -Ansible #cite(, form: "normal"), Chef #cite(, form: "normal"), -Docker #cite(, form: "normal"), and shell scripts exemplify this -methodology. While imperative configurations enable the use of complex logic and -conditional operations, they can be challenging to maintain due to their -non-idempotent nature, meaning the same script may yield different results -depending on the environment's initial state. This approach requires careful -management to ensure consistency and repeatability, providing detailed control -at the expense of simplicity and predictability. - -The expressiveness of imperative tools allows for stronger assumptions about the -environment's current state, which increases the likelihood of configuration -drift as environments diverge over time. To achieve consistency in an imperative -paradigm, it often necessitates extensive error handling, validation checks, and -retries, ensuring that despite the stepwise nature of the process, the system -reaches a stable end state. - -===== Declarative Configuration Management - -Declarative configuration management ensure idempotence, meaning the same -configuration can be applied multiple times without altering the environment -beyond its intended state. This abstraction simplifies understanding and -maintenance by allowing the system to determine the necessary actions to achieve -the desired state. 
Tools such as Puppet #cite(, form: "normal"), -Kubernetes #cite(,form: "normal"), -Terraform #cite(,form: "normal") and, under some conditions, -Docker #cite(, form: "normal") are used to specify the desired end -state. These tools typically feature their own specific #gls("DSL") to create -high-level descriptions of the desired environment's state, as opposed to -issuing imperative and procedural commands. The declarative approach mitigates -the risk of configuration drift by prioritising idempotence, maintaining -explicit dependency graphs, and ensuring a strong awareness of the current state -of the environment​​ #cite(,form:"normal", supplement: [p. 348]). - -While most configuration systems aim to be declarative to ensure reproducibility -and idempotency, some imperative tools can achieve a level of congruence. -However, this often comes at the cost of predictability and ease of maintenance, -making them less favourable in environments where stability and simplicity are -prioritised. - -#figure( - include "../../resources/typst/configuration-management-summary.typ", - caption: [Configuration Management Models and Paradigms], - kind: "table", - supplement: [Table], -) - -#info-box(kind: "note")[ - In @ch2-table-configuration-mgmt, Docker is classified as both declarative and - imperative. This dual classification arises from the fact that while Docker - often start with declarative configurations (e.g., a `Dockerfile`), it can - shift towards an imperative approach when imperative commands are introduced - within the `Dockerfile` to achieve the desired state. As a result, the same - `Dockerfile` may produce different outcomes depending on the base image in - use, leading to non-idempotent behaviour and ultimately hindering - reproducibility. 
-] - === Comparing Builds In the quest for software reproducibility, identifying and understanding the diff --git a/src/thesis/literature.bib b/src/thesis/literature.bib index 853e18a..85d1ffb 100644 --- a/src/thesis/literature.bib +++ b/src/thesis/literature.bib @@ -1074,3 +1074,17 @@ @misc{ThoughtsOnSystemsManagementMethods year = 2016, url = {https://flyingcircus.io/news/detailsansicht/thoughts-on-systems-management-methods} } + +@misc{python-dockerfile-repository, + title = {Python 3.12 Dockerfile}, + author = {docker-library project1}, + year = 2024, + url = {https://github.com/docker-library/python/blame/31bbb37b797bd5521d6622c6d54052d6d0ede585/3.12/bookworm/Dockerfile} +} + +@misc{dockerofficialimages, + title = {What are official images}, + author = {Docker Inc.}, + year = 2024, + url = {https://github.com/docker-library/official-images/blob/6b4803e65a2c56f15b91f8a11bd90f0bcb756c1c/README.md#what-are-official-images}, +}