diff --git a/resources/typst/configuration-management-summary.typ b/resources/typst/configuration-management-summary.typ new file mode 100644 index 0000000..3f33dd3 --- /dev/null +++ b/resources/typst/configuration-management-summary.typ @@ -0,0 +1,46 @@ +#import "../../src/thesis/imports/preamble.typ": * + + +#table( // Summary matrix: rows are management models, columns are paradigms. + columns: (1fr, 1fr, 1fr), + stroke: none, + table.header( + [], + table.vline(stroke: 1pt), + [#align(center)[Imperative]], + table.vline(stroke: .5pt), + [#align(center)[Declarative]], + table.hline(stroke: 1pt), + ), + table.cell(align: horizon + center)[Divergent], + [ + - Shell commands + ], + [ + - Shell scripts + - Ansible + ], + table.hline(stroke: .5pt), + table.cell(align: horizon + center, rowspan: 2)[Convergent], + table.cell(colspan: 2)[ // Docker spans both the Imperative and Declarative columns. + - Docker + ], + table.hline(stroke: .5pt + luma(200), start: 1), + [ + - Ansible + - Chef + - Shell scripts + ], + [ + - Puppet + - Kubernetes + - Terraform + ], + table.hline(stroke: .5pt), + table.cell(align: horizon + center)[Congruent], + [], + [ + - Nix + - Guix + ], +) diff --git a/resources/typst/configuration-management.typ b/resources/typst/configuration-management.typ new file mode 100644 index 0000000..4b8de6c --- /dev/null +++ b/resources/typst/configuration-management.typ @@ -0,0 +1,126 @@ +#import "../../src/thesis/imports/preamble.typ": * + +#grid( // Three plots side by side; each figure carries the label that 2-reproducibility.typ references. + columns: (1fr, 1fr, 1fr), + gutter: 1em, + [ + #set align(bottom) + #figure( + { + set text(font: "Virgil 3 YOFF") + cetz.canvas({ + import cetz.plot + import cetz.draw: content + + plot.plot( + size: (3.5, 3), + y-label: [State], + x-label: [Time], + axis-style: "school-book", + x-tick-step: none, + y-tick-step: none, + x-min: 0, + x-max: 500, + x-grid: true, + y-min: 0, + y-max: 500, + legend: "legend.north", + { // Two series; presumably actual (solid) and target (dashed), as labelled in the Convergent plot. + plot.add( + ((75, 75), (450, 300)), + mark: "o", + ) + plot.add( + ((75, 50), (450, 125)), + mark: "o", + style: (stroke: (paint: red, dash: "dashed")), + ) + }, + ) + }) + }, + caption: [Divergent], + ) <divergent-config-management> + ], + [ + #set 
align(bottom) + #figure( + { + set text(font: "Virgil 3 YOFF") + cetz.canvas({ + import cetz.plot + import cetz.draw: * + + plot.plot( + size: (3.5, 3), + y-label: [State], + x-label: [Time], + axis-style: "school-book", + x-tick-step: none, + y-tick-step: none, + x-min: 0, + x-max: 500, + x-grid: true, + y-min: 0, + y-max: 500, + legend: "legend.north", + { + plot.add( + ((75, 75), (450, 300)), + style: (stroke: (paint: blue)), + mark: "o", + label: "actual", + ) + plot.add( + ((75, 500), (450, 325)), + mark: "o", + label: "target", + style: (stroke: (paint: red, dash: "dashed")), + ) + }, + ) + }) + }, + caption: [Convergent], + ) <convergent-config-management> + ], + [ + #set align(bottom) + #figure( + { + set text(font: "Virgil 3 YOFF") + cetz.canvas({ + import cetz.plot + import cetz.draw: * + + plot.plot( + size: (3.5, 3), + y-label: [State], + x-label: [Time], + axis-style: "school-book", + x-tick-step: none, + y-tick-step: none, + x-min: 0, + x-max: 500, + x-grid: true, + y-min: 0, + y-max: 500, + legend: "legend.inner-south-east", + { + plot.add( + ((75, 75), (450, 300)), + mark: "o", + ) + plot.add( + ((75, 50), (450, 275)), + mark: "o", + style: (stroke: (paint: red, dash: "dashed")), + ) + }, + ) + }) + }, + caption: [Congruent], + ) <congruent-config-management> + ], +) diff --git a/src/thesis/2-reproducibility.typ b/src/thesis/2-reproducibility.typ index 00381c1..39e5d2a 100644 --- a/src/thesis/2-reproducibility.typ +++ b/src/thesis/2-reproducibility.typ @@ -1718,6 +1718,163 @@ as `-u`, and the `LC_ALL` environment variable to the `date` command. This approach ensures that the output we receive is predictable and consistent, regardless of the underlying system configuration. +==== Environments and Configuration Management + +In the context of #gls("SE"), reproducibility not only relies on stable +codebases but also heavily depends on consistent and well-maintained +environments. 
Configuration management plays a critical role in ensuring +reproducibility by mitigating the non-deterministic behaviours introduced by +configuration drift. + +#info-box[ +Configuration drift occurs when changes to an environment +accumulate over time, leading to variations that deviate from the desired or +initial configuration state, thus introducing non-determinism. +] + +This section examines key configuration management models, +their impact on reproducibility, and the tools that enforce these principles in +modern software environments. + +Another source of non-determinism arises from inconsistent environment +configurations. The way environments are managed directly affects +their behaviour and, inherently, their reproducibility. Therefore, configuration +management plays an important role in mitigating non-determinism by ensuring +that systems, software installations and software builds remain consistent +across different environments. + +@Traugott2002 classify environment configuration management into three +categories, each of which has a distinct impact on the level of determinism +achieved: + +#figure(include "../../resources/typst/configuration-management.typ") + +===== Divergent Configuration Management + +In this model (@divergent-config-management), environments are typically managed +by one or more individuals, which inevitably leads to +#emph[configuration drift], where the configurations of different systems +deviate over time. This is an unavoidable process when system modifications are +performed without centralised control, leading to unpredictable and +non-deterministic behaviour and making reproducibility almost impossible in complex +infrastructures. Reducing reliance on manual adjustments is essential to +achieving higher levels of system predictability and reproducibility. +A common example of this model is a newly installed operating system that +initially shares a uniform configuration. 
Over time, as users customise their +environments to suit individual preferences, the system’s state diverges from +its original, well-defined configuration. + +===== Convergent Configuration Management + +Once configuration drift is identified as an issue, the focus shifts towards +convergence, bringing systems back to a known and consistent state, as +illustrated in @convergent-config-management. While efforts are made to +standardise configurations, achieving exact uniformity is extremely challenging, +if not impossible. Systems may progressively "converge" towards a common +configuration, but subtle differences can persist, introducing variability. The +goal in this model is to minimise these variations as much as possible, though +complete uniformity is rarely attained. To illustrate this model, we could think +of an arbitrary environment that needs to be configured in a specific way to reach +a particular well-known state. For example, some specific dependencies have to be +installed. Tools like Puppet #cite(, form: "normal"), +Kubernetes #cite(,form: "normal"), +Terraform #cite(,form: "normal") and +Ansible #cite(, form: "normal") follow this model. +While convergent management offers flexibility in responding to unforeseen +changes in the environment, it is prone to feedback loops that may cause +unexpected behaviour. Such feedback loops make it difficult to achieve complete +reproducibility, as the system's progression towards the desired state is not +guaranteed to follow a deterministic path. + +===== Congruent Configuration Management + +This approach, illustrated in @congruent-config-management, enforces strict consistency across +all environments, ensuring that each environment maintains an identical +configuration. By preventing configuration drift from the outset, congruent +configuration management aims to eliminate one of the key sources of +non-determinism. 
Maintaining identical setups across environments is a central +goal of this model, providing the highest level of determinism and reliability +in system behaviours. To illustrate this model, we could think of an arbitrary +environment that needs to be configured in a specific way. + +Congruent management, particularly through the adoption of immutable +environments, ensures that systems remain in a well-defined state, thus +maximising reproducibility. However, this approach can lack the flexibility +required for dynamic environments, where each minor adjustment may necessitate +rebuilding the entire system. This limitation highlights the importance of +carefully choosing between convergent and congruent approaches based on the +system's needs. + +Tools such as Nix or Guix have demonstrated that it is possible to achieve a +high degree of congruence while allowing controlled divergence in specific areas +such as databases or secret management. This balance between convergence and +congruence highlights the flexibility required to maintain reproducibility in +environments that manage both static system components and dynamic data. + +Beyond configuration management models, we can also distinguish +two different configuration management paradigms. + +===== Imperative Configuration Management + +This paradigm specifies the exact steps required to transition an environment +from its current state to the desired state. Tools such as +Ansible #cite(, form: "normal"), Chef #cite(, form: "normal"), +Docker #cite(, form: "normal"), and shell scripts exemplify this +methodology. While imperative configurations enable the use of complex logic and +conditional operations, they can be challenging to maintain due to their +non-idempotent nature, meaning the same script may yield different results +depending on the environment's initial state. 
This approach requires careful +management to ensure consistency and repeatability, providing detailed control +at the expense of simplicity and predictability. + +The expressiveness of imperative tools allows for stronger assumptions about the +environment's current state, which increases the likelihood of configuration +drift as environments diverge over time. Achieving consistency in an imperative +paradigm often necessitates extensive error handling, validation checks, and +retries, ensuring that, despite the stepwise nature of the process, the system +reaches a stable end state. + +===== Declarative Configuration Management + +Declarative configuration management ensures idempotence, meaning the same +configuration can be applied multiple times without altering the environment +beyond its intended state. This abstraction simplifies understanding and +maintenance by allowing the system to determine the necessary actions to achieve +the desired state. Tools such as Puppet #cite(, form: "normal"), +Kubernetes #cite(,form: "normal"), +Terraform #cite(,form: "normal") and, under some conditions, +Docker #cite(, form: "normal") are used to specify the desired end +state. These tools typically feature their own specific #gls("DSL") to create +high-level descriptions of the desired environment's state, as opposed to +issuing imperative and procedural commands. The declarative approach mitigates +the risk of configuration drift by prioritising idempotence, maintaining +explicit dependency graphs, and ensuring a strong awareness of the current state +of the environment #cite(,form:"normal", supplement: [p. 348]). + +While most configuration systems aim to be declarative to ensure reproducibility +and idempotency, some imperative tools can achieve a level of congruence. +However, this often comes at the cost of predictability and ease of maintenance, +making them less favourable in environments where stability and simplicity are +prioritised. 
+ +#figure( + include "../../resources/typst/configuration-management-summary.typ", + caption: [Configuration Management Models and Paradigms], + kind: "table", + supplement: [Table], +) <ch2-table-configuration-mgmt> + +#info-box(kind: "note")[ + In @ch2-table-configuration-mgmt, Docker is classified as both declarative and + imperative. This dual classification arises from the fact that, while Docker + often starts with declarative configurations (e.g., a `Dockerfile`), it can + shift towards an imperative approach when imperative commands are introduced + within the `Dockerfile` to achieve the desired state. As a result, the same + `Dockerfile` may produce different outcomes depending on the base image in + use, leading to non-idempotent behaviour and ultimately hindering + reproducibility. +] + === Comparing Builds In the quest for software reproducibility, identifying and understanding the