From 88cb0863c47809887c1055bfb9d64a0fd547eac8 Mon Sep 17 00:00:00 2001 From: Antongiacomo Polimeno Date: Mon, 13 Nov 2023 16:55:54 +0100 Subject: [PATCH] minor instance --- ... Access Control - extension.code-workspace | 197 +++++++++++++++++- macro.tex | 1 + metrics.tex | 40 +--- policy.tex | 34 --- related.tex | 12 +- service_composition.tex | 91 +++++--- system_model.tex | 49 +++-- 7 files changed, 298 insertions(+), 126 deletions(-) delete mode 100644 policy.tex diff --git a/Big Data Access Control - extension.code-workspace b/Big Data Access Control - extension.code-workspace index 9611f33..bdeae39 100644 --- a/Big Data Access Control - extension.code-workspace +++ b/Big Data Access Control - extension.code-workspace @@ -8,6 +8,201 @@ "cSpell.words": [ "Jaccard" ], - "window.commandCenter": false + "window.commandCenter": false, + "files.exclude": { + "**/*.aux": true, + "**/*.lof": true, + "**/*.log": true, + "**/*.lot": true, + "**/*.fls": true, + "**/*.out": true, + "**/*.toc": true, + "**/*.fmt": true, + "**/*.fot": true, + "**/*.cb": true, + "**/*.cb2": true, + "**/.*.lb": true, + "**/*.dvi": true, + "**/*.xdv": true, + "**/*-converted-to.*": true, + "**/main.pdf": true, + "**/*.bbl": true, + "**/*.bcf": true, + "**/*.blg": true, + "**/*-blx.aux": true, + "**/*-blx.bib": true, + "**/*.run.xml": true, + "**/*.fdb_latexmk": true, + "**/*.synctex": true, + "**/*.synctex(busy)": true, + "**/*.synctex.gz": true, + "**/*.synctex.gz(busy)": true, + "**/*.pdfsync": true, + "**/latex.out/": true, + "**/*.alg": true, + "**/*.loa": true, + "**/acs-*.bib": true, + "**/*.thm": true, + "**/*.nav": true, + "**/*.pre": true, + "**/*.snm": true, + "**/*.vrb": true, + "**/*.soc": true, + "**/*.cut": true, + "**/*.cpt": true, + "**/*.spl": true, + "**/*.ent": true, + "**/*.lox": true, + "**/*.mf": true, + "**/*.mp": true, + "**/*.t[1-9]": true, + "**/*.t[1-9][0-9]": true, + "**/*.tfm": true, + "**/*.end": true, + "**/*.?end": true, + "**/*.[1-9]": true, + "**/*.[1-9][0-9]": true, + "**/*.[1-9][0-9][0-9]": true, + "**/*.[1-9]R": true, + "**/*.[1-9][0-9]R": true, + "**/*.[1-9][0-9][0-9]R": true, + "**/*.eledsec[1-9]": true, + "**/*.eledsec[1-9]R": true, + "**/*.eledsec[1-9][0-9]": true, + "**/*.eledsec[1-9][0-9]R": true, + "**/*.eledsec[1-9][0-9][0-9]": true, + "**/*.eledsec[1-9][0-9][0-9]R": true, + "**/*.acn": true, + "**/*.acr": true, + "**/*.glg": true, + "**/*.glo": true, + "**/*.gls": true, + "**/*.glsdefs": true, + "**/*.lzo": true, + "**/*.lzs": true, + "**/*.slg": true, + "**/*.slo": true, + "**/*.sls": true, + "**/*.gnuplot": true, + "**/*.table": true, + "**/*-gnuplottex-*": true, + "**/*.gaux": true, + "**/*.glog": true, + "**/*.gtex": true, + "**/*.4ct": true, + "**/*.4tc": true, + "**/*.idv": true, + "**/*.lg": true, + "**/*.trc": true, + "**/*.xref": true, + "**/*.brf": true, + "**/*-concordance.tex": true, + "**/*-tikzDictionary": true, + "**/*.lol": true, + "**/*.ltjruby": true, + "**/*.idx": true, + "**/*.ilg": true, + "**/*.ind": true, + "**/*.maf": true, + "**/*.mlf": true, + "**/*.mlt": true, + "**/*.mtc[0-9]*": true, + "**/*.slf[0-9]*": true, + "**/*.slt[0-9]*": true, + "**/*.stc[0-9]*": true, + "**/_minted*": true, + "**/*.pyg": true, + "**/*.mw": true, + "**/*.newpax": true, + "**/*.nlg": true, + "**/*.nlo": true, + "**/*.nls": true, + "**/*.pax": true, + "**/*.pdfpc": true, + "**/*.sagetex.sage": true, + "**/*.sagetex.py": true, + "**/*.sagetex.scmd": true, + "**/*.wrt": true, + "**/svg-inkscape/": true, + "**/*.sout": true, + "**/*.sympy": true, + 
"**/sympy-plots-for-*.tex/": true, + "**/*.upa": true, + "**/*.upb": true, + "**/*.pytxcode": true, + "**/pythontex-files-*/": true, + "**/*.listing": true, + "**/*.loe": true, + "**/*.dpth": true, + "**/*.md5": true, + "**/*.auxlock": true, + "**/*.ptc": true, + "**/*.tdo": true, + "**/*.hst": true, + "**/*.ver": true, + "**/*.lod": true, + "**/*.xcp": true, + "**/*.xmpi": true, + "**/*.xdy": true, + "**/*.xyc": true, + "**/*.xyd": true, + "**/*.ttt": true, + "**/*.fff": true, + "**/TSWLatexianTemp*": true, + "**/*.bak": true, + "**/*.sav": true, + "**/.texpadtmp": true, + "**/*.lyx~": true, + "**/*.backup": true, + "**/.*.swp": true, + "**/*~[0-9]*": true, + "**/*.tps": true, + "./auto/*": true, + "**/*.el": true, + "**/*-tags.tex": true, + "**/*.sta": true, + "**/*.lpz": true, + "**/*.xwm": true, + "**/*.vtc": true, + "**/*.glstex": true, + "**/.DS_Store": true, + "**/.AppleDouble": true, + "**/.LSOverride": true, + "**/Icon": true, + "**/._*": true, + "**/.DocumentRevisions-V100": true, + "**/.fseventsd": true, + "**/.Spotlight-V100": true, + "**/.TemporaryItems": true, + "**/.Trashes": true, + "**/.VolumeIcon.icns": true, + "**/.com.apple.timemachine.donotpresent": true, + "**/.AppleDB": true, + "**/.AppleDesktop": true, + "**/Network Trash Folder": true, + "**/Temporary Items": true, + "**/.apdisk": true, + "**/*.icloud": true, + ".vscode/*": true, + ".vscode/settings.json": false, + ".vscode/tasks.json": false, + ".vscode/launch.json": false, + ".vscode/extensions.json": false, + ".vscode/*.code-snippets": false, + "**/.history/": true, + "**/*.vsix": true, + "**/.history": true, + "**/.ionide": true, + "**/.venv/": true, + "*/*.csv": true, + ".venv/**/*": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_x86.dylib": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_x86_64.dylib": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86.o": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86_64.o": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.*.so": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.*.pyd": true, + ".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.pyx": true + } } } \ No newline at end of file diff --git a/macro.tex b/macro.tex index 3bbeae3..9013366 100644 --- a/macro.tex +++ b/macro.tex @@ -15,6 +15,7 @@ \newcommand{\T}{\ensuremath{T}} \newcommand{\TF}[1]{\ensuremath{T^F_{#1}}} +\newcommand{\tf}[1]{\ensuremath{t^f_{#1}}} \newcommand{\TP}{\ensuremath{T^{P}}} \newcommand{\G}{\ensuremath{G}} diff --git a/metrics.tex b/metrics.tex index c11ed6d..dc7583f 100644 --- a/metrics.tex +++ b/metrics.tex @@ -1,22 +1,5 @@ \section{Heuristics}\label{sec:coalition} -% Coalition building is a crucial process having direct impact on the quality of the analytics results. %Figure~\ref{fig:smet} -% Figure X shows how data lineage is impacted by the processing lineage and in particular by i) the \textit{coalition agreement} $\textit{CA}_C$ (i.e., the CA-driven transformations adopted for a give coalition) and by ii) the transformation produced by the different jobs (job-specific transformation) part of a given coalition \coalition{}. 
-% Let us consider job $\job{1}^{\org{1}}$ of %Figure~\ref{fig:smet} -% Figure X it receives as input the data \trans{1}(\dataset{1}) based on the dataset obtained by \dataset{1} after the transformation \trans{1} which is associated to the data lineage by our AC model. It then produce a data that is the job-specific transformation on the input data (i.e., \trans{1}(\dataset{1})) generating \dataset{2}. -% We note that our Big Data Analytics pipeline models includes alternatives allowing different processing lineage (linear independent path in the Big data graph G) doing the same analytics but using different jobs (e.g., a lineage including k-means or a lineage using c-means). This will lead to different job-specific transformation on the data for the same Big Data pipeline. -% In this paper, for the sake of simplicity we i) consider different coalitions for each processing lineage, ii) coalitions made of trustworthy organizations \org{i} providing candidate services for each job and iii) job-specific transformation not influenced by the organizations' behavior. -% In this scenario, since any coalition of a given processing lineage will produce the same job-specific data transformation, the analytics pipeline quality is impacted only by the \textit{coalition agreement} $\textit{CA}_C$ or rather by the transformations \trans{i} imposed by the given coalition \coalition{} on the data lineage. -% In the following we first present metrics to evaluate data quality across the data lineage, and then a set of solutions to build coalitions for given Big Data pipeline ensuring a given data quality. - -% %\begin{example}\label{ex:p1j} -% %The choice of the specific deployment has an impact on the way in which the coalition \coalition{} of organizations \org{i} is formed as discussed in the following of this section. -% %Let us consider the following example where we have a pipeline made of just one ingestion job that can be offered by service provider $s_1$ or by the service provider $s_2\] In case the $s_1$ is selected the transformation $T_1$ is triggered according to the authorization $s_1$ has on the data, in this example $s_1$ has full control meaning that transformation $t_1$ is empty. In case the $s_2$ is selected the transformation $T_2$ is triggered according to the authorization $s_2$ has on the data and in this example data labelled as PII are removed. -% %\end{example} -% %Considering the two data lineage generated by the two different coalition in Example\ref{ex:p1j} the one involving $s_2$ produce a significant changes to data compared to the other one. This data changes can have direct impact on the quality of the analytics outcomes, therefore our goal is to build coalitions ensuring specific data quality. This coalition building problem can be assimilated to xxx showing an exponential complexity ... -% %In the following we fist introduce our data quality metrics and then our heuristics to solve the problem of coalition building - -% %\subsection{Data Quality metrics} -%\subsection{Coalition Heuristics} + \subsection{Metrics}\label{sec:metrics} Data quality is a largely studied topic for the database management research communities, @@ -29,7 +12,6 @@ \subsection{Metrics}\label{sec:metrics} In the following we present a set of metrics to evaluate the quality of the data at each step of the big data pipeline. -We The proposed metrics can be classified into two categories, namely quantitative and statistical. 
Initially, these metrics are applied to the original dataset (X) without any transformations, and subsequently, they are applied to the transformed dataset (Y).
@@ -48,29 +30,13 @@ \subsubsection{Jaccard coefficient}
Unlike other similarity measures, such as Euclidean distance, Jaccard coefficient is not affected by the magnitude of the values in the dataset.
This property makes it suitable for datasets with categorical variables or nominal data, where the values do not have a meaningful numerical interpretation.
-\subsubsection{Jaccard coefficent with weights} Let us consider two dataset X and Y of the same size.
-The Jaccard coefficent is defined as:\[J(X,Y) = \frac{\sum_{i=1}^{n}w_i(x_i \cap y_i)}{\sum_{i=1}^{n}w_i(x_i \cup y_i)}\]
+\subsubsection{Jaccard coefficient with weights} Let us consider two datasets X and Y of the same size.
+The Jaccard coefficient is defined as:\[J(X,Y) = \frac{\sum_{i=1}^{n}w_i(x_i \cap y_i)}{\sum_{i=1}^{n}w_i(x_i \cup y_i)}\]
Which is computed by dividing the cardinality of the intersection of two sets by the cardinality of their union, weighted by the weights assigned to the elements in the sets.
Weights allow for the prioritization of certain features or elements in the datasets.
This approach can be particularly useful when some elements in the dataset have more importance or relevance than others.
By assigning weights to the elements, the weighted Jaccard similarity can account for this importance and provide a more accurate measure of similarity.
-% \subsubsection{Kullback-Leibler divergence}
-% Let us consider two dataset X and Y of the same size.
-% The KL divergence is defined as:\[KL(X,Y) = \sum_{i=1}^{n}x_i \log \frac{x_i}{y_i}\]
-% Which is computed by taking the sum of the product of each element in the first dataset and the logarithm of the ratio of the same element in the second dataset.
-% The KL divergence is a measure of the difference between two probability distributions and is useful for comparing the dissimilarity of two datasets.
-
-
-% \subsubsection{Kullback-Leibler divergence with weights} Let us consider two dataset X and Y of the same size. The weighted KL divergence is defined as:
-
-% \[KL(X,Y) = \sum_{i=1}^{n}w_i(x_i \log \frac{x_i}{y_i})\]
-
-% The weighted KL divergence is a variant of the KL divergence that incorporates weights to the elements in the datasets being compared.
-% It allows for the prioritization of certain features or elements in the datasets.
-% This approach can be particularly useful when some elements in the dataset have more importance or relevance than others.
-% By assigning weights to the elements, the weighted KL divergence can account for this importance and provide a more accurate measure of dissimilarity.
-
\subsubsection{Jensen-Shannon Divergence} Let us consider two datasets X and Y of the same size.
The Jensen-Shannon divergence (JSD) is a symmetrized version of the KL divergence and can be used to measure the dissimilarity between the two probability distributions.
diff --git a/policy.tex b/policy.tex
deleted file mode 100644
index 081b675..0000000
--- a/policy.tex
+++ /dev/null
@@ -1,34 +0,0 @@
-\section{Architectural Deployment}\label{sec:architecture}
-We present the architectural deployment of our access control system discussing two possible approaches: \emph{i)} centralized deployment; \emph{ii)} decentralized deployment.
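For concreteness, the following minimal sketch (in Python) shows one possible reading of the weighted Jaccard coefficient and of the Jensen-Shannon divergence defined above; the record layout, the unit weights, and the masking example are assumptions made for this illustration, not part of the metrics themselves.

    from collections import Counter
    from math import log2

    def weighted_jaccard(x_rows, y_rows, weights):
        # Weighted Jaccard: sum of the weights of records shared by X and Y,
        # divided by the sum of the weights of records appearing in either one.
        x, y = set(x_rows), set(y_rows)
        inter = sum(weights.get(r, 1.0) for r in x & y)
        union = sum(weights.get(r, 1.0) for r in x | y)
        return inter / union if union else 1.0

    def js_divergence(x_values, y_values):
        # Jensen-Shannon divergence between the empirical value distributions
        # of X and Y (symmetrized variant of the KL divergence).
        p, q = Counter(x_values), Counter(y_values)
        support = set(p) | set(q)
        P = {v: p[v] / len(x_values) for v in support}
        Q = {v: q[v] / len(y_values) for v in support}
        M = {v: 0.5 * (P[v] + Q[v]) for v in support}
        def kl(A):
            return sum(A[v] * log2(A[v] / M[v]) for v in support if A[v] > 0)
        return 0.5 * kl(P) + 0.5 * kl(Q)

    # Toy example: the transformation masks the ZIP code of two records.
    X = [("alice", "10001"), ("bob", "10002"), ("carol", "10003")]
    Y = [("alice", "*"), ("bob", "*"), ("carol", "10003")]
    w = {r: 1.0 for r in X}             # larger weights prioritize key records
    print(weighted_jaccard(X, Y, w))    # 0.2: most records were altered
    print(js_divergence([z for _, z in X], [z for _, z in Y]))

Under this reading, records are compared by exact equality, so any transformed record drops out of the intersection; higher weights make the loss of the corresponding records affect the score more.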
The choice of the specific deployment has an impact on the way in which the coalition \coalition{} of organizations \org{i} is formed as discussed in the following of this section. - -\begin{figure*}[!t] - \begin{tabular}{cc} - \parbox[]{10cm}{~\\~\\~\\~\\~\\~\\~\\~\\~\\~\\}&\parbox[]{10cm}{~\\~\\~\\~\\~\\~\\~\\~\\~\\~\\}\\ - (a)&(b)\\ - \end{tabular} - \caption{Centralized (a) and decentralized (b) deployment} -\end{figure*} - -\subsection{Centralized Deployment}\label{sec:centralized} -A centralized deployment implements a service orchestration, where an orchestrator \orc\ provides access control functionalities and mediates access from organization \org{i} in \coalition{} to a given dataset \dataset{}. A service orchestration can be formally defined as follows. - -\begin{definition}\label{def:orchestration} - Given a big data analytics pipeline \G(\V,\E) in Definition \ref{def:pipeline}, a coalition \coalition{} of organizations \org{i}$\in$\Org{} each implementing a job \job{i}, a dataset \dataset{}, and an orchestrator \orc{}, a service orchestration is a direct acyclic graph \G$^c$(\V$^c$,\E$^c$), where \V$^c$=\V$_I$$\cup$\orc{} and \E$^c$=\{\vi{i},\orc{}\}$\cup$\{\orc{},\vi{i}\}, with \vi{i}$\in$\V$^c$ modeling a job \job{i}. -\end{definition} - -We note that each communication between two subsequent jobs \job{i-1} and \job{i} in \coalition{} is mediated by the orchestrator, which enforces all applicable policies. We also note that vertices \vi{c} and \vi{m} of an alternative structure, as well as \vi{f} and \vi{j} of a parallel structure, are included in the orchestrator \orc{}. Two enforcement processes are implemented in the centralized deployment as follows. -\begin{enumerate} - \item \textbf{Incoming enforcement.} Policy enforcement is done by the orchestrator before dataset \dataset{} is released to a specific (set of) job \job{i}. It is then executed for each edge \{\orc{},\job{i}\} and decrease or maintain the utility of the dataset (i.e., the enforcement has no impact on the dataset or remove some information). - \item \textbf{Outgoing enforcement.} Policy enforcement is done by the orchestrator after a resulting dataset \dataset{} is returned by a specific (set of) job \job{i}. It is then executed for each edge \{\job{i},\orc{}\} and aims to restore those data that were manipulated before access by \job{i-1}. In other words, once the resulting dataset is returned by \job{i-1}, orchestrator \orc{} restore those data that were not accessible by \job{i-1} (e.g., by deanonymizing it). -\end{enumerate} - -We note that centralized deployment maximizes the utility of the dataset providing each job with the largest amount of data possible, meaning that data transformation assumes a non-monotonic behavior. It is a single point of failure and assumes all jobs to coexist in a single environment. Generalization to multiple environments is possible but outside the scope of this paper. - -\subsection{Decentralized Deployment}\label{sec:decentralized} -A decentralized deployment implements a service choreography, where organization \org{i} in \coalition{} are directly connected and exchange data. A service choreography can be formally defined as follows. 
- -\begin{definition}\label{def:choreography} - Given a big data analytics pipeline \G(\V,\E) in Definition \ref{def:pipeline}, a coalition \coalition{} of organizations \org{i}$\in$\Org{} each implementing a job \job{i}, and a dataset \dataset{}, a service choreography is a direct acyclic graph \G$^d$(\V$^d$,\E$^d$), where \V$^d$=\V\ and \E$^d$=\{\vi{i},\vi{j}\} with \vi{i}$\in$\V$^d$ modeling a job \job{i}. -\end{definition} - -We note that each communication between two subsequent jobs \job{i-1} and \job{i} in \coalition{} is direct with no mediation. Each job is complemented with an external plugin enforcing all applicable policies. Policy enforcement is done by the plugin of \job{i-1} before dataset \dataset{} is released to job \job{i}. It is then executed for each edge \{\job{i-1},\job{i}\}, enforcing all applicable policies and decrease or maintain the utility of the dataset (i.e., the enforcement has no impact on the dataset or remove some information). We note that decentralized deployment provides a data transformation assuming a not-increasing monotonic behavior. Communications are distributed among different job platforms requiring data transfer during analytics. This choice could decrease performance when huge datasets must be distributed. diff --git a/related.tex b/related.tex index 300151e..107ee32 100644 --- a/related.tex +++ b/related.tex @@ -1,8 +1,8 @@ \section{Related Work}\label{sec:related} -\begin{itemize} -\item We believe that the closest approach to the one in this paper is the work of Hu et al. \cite{HUFerraiolo:2014}, introducing a generalised access control model for big data processing frameworks, which can be extended to the Hadoop environment. However, the paper discusses the issues only from a high-level architectural point of view, without discussing a tangible solution. -\item \cite{GuardSpark:ACSAC:2020} purpose-aware access control model, where purposes are data processing purpose and data operation purpose; the enforcement mechanism, still based on yes/no answer is based on an algorithm that checks if the operation on data to be performed matches to the purpose. The examples are given only for structured data and SQL queries. E se da una parte fa piu' di altri, dall'altra non ci sono attributi associati ai soggetti e agli oggetti, cosa che limita un pochino. -\item \cite{Sandhu:ABAC:2018} propose a solution specifically tailored to the Apache Hadoop stack, una semplice formalizzazione dell'AC in Hadoop. Non considerano la messa in sicurezza dell'ingestion time e non considerano la questione delle coalizioni. Considerano solo servizi all'interno di Hadoop ecosystem. Classica risposta yes/no. -\item \cite{ABACforHBase:2019} questo e' solo su HBase -\end{itemize} \ No newline at end of file +% \begin{itemize} +% \item We believe that the closest approach to the one in this paper is the work of Hu et al. \cite{HUFerraiolo:2014}, introducing a generalised access control model for big data processing frameworks, which can be extended to the Hadoop environment. However, the paper discusses the issues only from a high-level architectural point of view, without discussing a tangible solution. +% \item \cite{GuardSpark:ACSAC:2020} purpose-aware access control model, where purposes are data processing purpose and data operation purpose; the enforcement mechanism, still based on yes/no answer is based on an algorithm that checks if the operation on data to be performed matches to the purpose. The examples are given only for structured data and SQL queries. 
And while on the one hand it does more than others, on the other hand there are no attributes associated with subjects and objects, which is somewhat limiting.
+% \item \cite{Sandhu:ABAC:2018} propose a solution specifically tailored to the Apache Hadoop stack, a simple formalization of access control in Hadoop. They do not consider securing the ingestion phase and do not consider the question of coalitions. They only consider services within the Hadoop ecosystem. Classic yes/no answer.
+% \item \cite{ABACforHBase:2019} this one only targets HBase
+% \end{itemize}
\ No newline at end of file
diff --git a/service_composition.tex b/service_composition.tex
index 962dcf9..52cc5c6 100644
--- a/service_composition.tex
+++ b/service_composition.tex
@@ -74,6 +74,36 @@ \subsection{Pipeline Template Definition}\label{sec:templatedefinition}
 \label{fig:service_composition_template}
 \end{figure}
+
+
+\begin{figure}[ht!]
+ \centering
+ \begin{tikzpicture}[scale=0.9]
+ % Nodes
+ \node[draw] (node1) at (0,1) {$\s{r}$};
+ \node[draw] (node2) at (0,2) {Data preparation};
+ \node[draw] (node3) at (0,3) {$\timesOperator$};
+ \node[draw] (node4) at (-2,4) {Statistics};
+ \node[draw] (node5) at (2,4) {Machine learning};
+ \node[draw] (node6) at (0,5) {$\timesOperator$};
+ \node[draw] (node65) at (0,6) {Storage};
+
+ \draw[->] (node1) -- (node2);
+ \draw[->] (node2) -- (node3);
+ \draw[->] (node3) -- (node4);
+ \draw[->] (node3) -- (node5);
+ \draw[->] (node5) -- (node6);
+ \draw[->] (node4) -- (node6);
+ \draw[->] (node6) -- (node65);
+
+
+ \end{tikzpicture}
+ \caption{Pipeline Template}
+ \label{fig:service_composition_template}
+\end{figure}
+
+
+
 \begin{figure}[ht!]
 \centering
 \begin{tikzpicture}
@@ -102,7 +132,7 @@ \subsection{Data Protection Annotation \myLambda}\label{sec:nonfuncannotation}
 A \emph{Policy Condition} is a Boolean expression of the form $($\emph{attr\_name} op \emph{attr\_value}$)$, with op$\in$\{$<$,$>$,$=$,$\neq$,$\leq$,$\geq$\}, \emph{attr\_name} an attribute label, and \emph{attr\_value} the corresponding attribute value.
 \end{definition}
-An access control policy then specifies who (\emph{subject}) can access what (\emph{object}) with action (\emph{action}), in a specific context (\emph{environment}) and under specific obligations (\emph{data transformation}), as formally defined below.
+An access control policy then specifies who (\emph{subject}) can access what (\emph{object}) with action (\emph{action}), in a specific context (\emph{environment}) and under specific obligations (\emph{data transformation}), as formally defined below.
 \begin{definition}[Policy]\label{def:policy_rule}
 A {\it policy P} is a 5-tuple $<$\textit{subj}, \textit{obj}, \textit{action}, \textit{env}, \textit{\TP}$>$, where:
@@ -115,7 +145,7 @@ \subsection{Data Protection Annotation \myLambda}\label{sec:nonfuncannotation}
 \item Environment \textit{env} defines a set of conditions on contextual attributes, such as time of the day, location, IP address, risk level, weather condition, holiday/workday, emergency. It is a set \emph{PC} of \emph{Policy Conditions} as defined in Definition \ref{def:policy_cond}. For instance, $<$\emph{env},\{(time $=$ "night")\}$>$ refers to a policy that is applicable only at night.
- \item Data Transformation \textit{\TP} defines a set of security and privacy-aware transformations on \textit{obj}, which must be enforced before any access to data. Transformations focus on data protection, as well as compliance to regulations and standards, in addition to simple format conversions.
+ \item Data Transformation \textit{\TP} defines a set of security and privacy-aware transformations on \textit{obj}, which must be enforced before any access to data. Transformations focus on data protection, as well as compliance to regulations and standards, in addition to simple format conversions. \end{description} \end{definition} @@ -123,38 +153,35 @@ \subsection{Data Protection Annotation \myLambda}\label{sec:nonfuncannotation} \subsection{Functional Annotations}\label{sec:funcannotation} A proper data management approach must track functional data manipulations across the entire pipeline execution, defining the functional requirements of each service operating on data. -To this aim, each vertex \vi{i}$\in$\V$_S$ is annotated with a label \myGamma(\vi{i}), corresponding to the functional description $F_i$ of the service $s_i$ represented by \vi{i}. +To this aim, each vertex \vi{i}$\in\V_S$ is annotated with a label \myGamma(\vi{i}), corresponding to the functional description $F_i$ of the service $s_i$ represented by \vi{i}. $F_i$ describes the functional requirements on the corresponding service $s_i$, such as API, inputs, expected outputs. %The latter is modeled as a functional transformation function \TF\ that is applied to the data when executing service $s_i$. \TF\ has a twofold role: %\begin{enumerate}[label=\roman*)] % \item it contains the functional requirements that the service must satisfy, in terms of expected input, expected output, prototype and other functional aspects. % \item -It also specifies the specific data transformation function \TF{}, triggered as the result of a service execution. %applied to the data when executing service $s_i$. - -This function can be classified according to four types. +It also specifies a set \TF{} of data transformation functions \tf{i}, possibly triggered during execution of the connected service $s_i$. +Each $\tf{i}\in\TF{}$ can be classified as one of the following: \begin{enumerate*}[label=\roman*)] - \item Function \TF{\epsilon}, an empty function that applies no transformation or processing on the data. - \item Function \TF{a}, an additive function that expands the amount of data received, for example, by integrating data from other sources. - \item Function \TF{t}, a transformation function that transforms some records in the dataset without altering the domain. - \item Function \TF{d} (out of the scope of this work), a transformation function that changes the domain of the data by applying, for instance, PCA or K-means. + \item Function \tf{\epsilon}, an empty function that applies no transformation or processing on the data. + \item Function \tf{a}, an additive function that expands the amount of data received, for example, by integrating data from other sources. + \item Function \tf{t}, a transformation function that transforms some records in the dataset without altering the domain. + \item Function \tf{d} (out of the scope of this work), a transformation function that changes the domain of the data by applying, for instance, PCA or K-means. \end{enumerate*} -%A transformation function can be classified according to four types. -% \begin{enumerate*}[label=\roman*)] -% \item Function \F{e}, an empty function that applies no transformation or processing on the data. -% \item Function \F{a}, an additive function that expands the amount of data received, for example, by integrating data from other sources. 
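For illustration, a minimal sketch (assuming dict-shaped records and an invented masking rule, not the paper's implementation) of the three in-scope transformation-function types listed above; the domain-changing function \tf{d} is omitted since it is out of the scope of this work.

    def tf_empty(records):
        # tf_epsilon: empty function, the data passes through unchanged.
        return records

    def tf_additive(records, extra_source):
        # tf_a: additive function, expands the data received, e.g. by
        # integrating records coming from other sources.
        return records + extra_source

    def tf_transform(records):
        # tf_t: transformation function, rewrites some records without
        # altering the domain, e.g. masking a quasi-identifier.
        return [{**r, "zip": "***"} for r in records]

    ct_records = [{"name": "alice", "zip": "06101"}]   # hypothetical input
    ny_records = [{"name": "dan", "zip": "10001"}]     # hypothetical extra source
    prepared = tf_transform(tf_additive(tf_empty(ct_records), ny_records))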
-% \item Function \F{t}, a transformation function that transforms some records in the dataset without altering the domain.
-% \item Function \F{d}, a transformation function that changes the domain of the data by applying, for instance, PCA or K-means (out of the scope of this work).
-% \end{enumerate*}
+For simplicity, without loss of generality, it is assumed that all candidate services meet functional annotation \F{} and that \TF{}=\tf{}, resulting in the consideration of only one transformation.
+Therefore, all candidate services apply the same transformation to data during execution.
+
\subsection{Example}\label{sec:example}
As an example, let us consider a pipeline template $G^{\myLambda,\myGamma}$ with three vertices, as depicted in \cref{fig:service_composition_example}.
-It includes three key stages in our reference scenario: data anonymization (\vi{1}), data enrichment (\vi{2}), and data aggregation (\vi{3}), each stage with its policy $p$ and functional description \F.
-
+It includes three key stages in our reference scenario: data preparation (\vi{1}), data enrichment (\vi{2}), and data storage (\vi{3}), each stage with its policy $p$ and functional description \F.
+In Table x, we report the policies and functional descriptions for each vertex.
+%
%\begin{enumerate*}[label=n\arabic*)]
% \item
-The first vertex (\vi{1}) is responsible for data anonymization.
+The first vertex (\vi{1}) is responsible for data preparation.
It specifies an anonymization policy ($\myLambda(v_1)$) to protect sensitive information, such as personally identifiable information (PII) in the dataset.
-The transformation function \TF{1} in $\myGamma(v_1)$ is an empty function \TF{a}, as no functional transformation is required for anonymization.
+The functional annotation \TF{1} in $\myGamma(v_1)$ is a transformation function \tf{t}, which transforms some records in the dataset (e.g., by anonymizing PII) without altering its domain.
% \item
The second vertex (\vi{2}) focuses on data enrichment, where additional information from the states of New York and New Hampshire is integrated into the dataset.
@@ -215,7 +242,7 @@ \subsection{Example}\label{sec:example}
% \subsection{Instance}
% \hl{HERE TOO, AS FOR THE TEMPLATE, I WOULD TRY TO BE A BIT MORE FORMAL. SEE THE PAPER I SENT YOU.}
We define a \pipeline instantiation technique as a function that takes as input a \pipelineTemplate \tChartFunction and a set $S^c$ of compatible services, one for each vertex \vi{i}$\in$\V, and returns as output a \pipelineInstance \iChartFunction. We recall that compatible services $S^c_i$ are candidate services satisfying data protection annotations \myLambda(\vi{i}), for each \vi{i}$\in$$\V_S$.
- In \iChartFunction, every invocations $\vi{i}$$\in$\V$_S$ contains a service instance, and every branching $v\in\Vplus\bigcup\Vtimes$ is maintained as it is. We formally define our \pipelineInstance as follows.
+ In \iChartFunction, every invocation $\vi{i}$$\in$\V$_S$ contains a service instance, and every branching $v\in\Vplus\bigcup\Vtimes$ is maintained as it is. We formally define our \pipelineInstance as follows.
\begin{definition}[Pipeline Instance]\label{def:instance}
 Let \tChartFunction be a pipeline template, a pipeline Instance $\iChartFunction$ is a directed acyclic graph where:
@@ -233,22 +260,20 @@ \subsection{Example}\label{sec:example}
Condition 1 is needed to preserve the process functionality, as it simply states that each service $s'_i$ must satisfy the functional requirements $F_i$ of the corresponding vertex \vi{i} in the \pipelineTemplate.
Condition 2 states that each service $s'_i$ must satisfy the policy requirements \P{i} of the corresponding vertex \vi{i} in the \pipelineTemplate.
- We assume that Condition 1 is satisfied for all candidate services and therefore concentrate on Condition 2 in the following.
-
- % Le considerazioni che seguono partono dall'assunto che T sia uguale a T_p U T_f senza lack of generality
+ As assumed in Section~\ref{sec:funcannotation}, Condition 1 is satisfied for all candidate services; we therefore concentrate on Condition 2 in the following.
The \pipelineInstance is generated by traversing the \pipelineTemplate with a breadth-first search algorithm, starting from the root vertex \vi{r}.
Then for each vertex \vi{i} in the pipeline template, the corresponding vertex \vii{i}$\in$\Vp\ is generated.
Finally, for each vertex \vii{i}$\in$\Vp, a two-step selection approach is applied as follows.
\begin{itemize}
- \item \textit{Filtering Algorithm} -- As already discussed in Section~\ref{sec:templatedefinition}, filtering algorithm retrieves a set of candidate services and match them one-by-one against data protection requirements \myLambda(\vi{i}). In particular, the profile of each candidate service \si{j} is matched against policy \P{i} corresponding to \myLambda(\vi{i}). Filtering algorithm returns as output the set of compatible services that match the policy.
-
- Formally, let us consider a set $S^c$ of candidate services \si{j}, each one annotated with a profile. The filtering algorithm is executed for each \si{j}; it is successful if \si{j}'s profile satisfies \myLambda(\vi{i}) as the access control policy \P{i}; otherwise, \si{j} is discarded and not considered for selection. The filtering algorithm finally returns a subset $S'\subseteq S^c$ of compatible services, which represent the possible candidates for selection.
+ \item \textit{Filtering Algorithm} -- As already discussed in Section~\ref{sec:templatedefinition}, the filtering algorithm retrieves a set of candidate services and matches them one-by-one against data protection requirements \myLambda(\vi{i}). In particular, the profile of each candidate service \si{j} is matched against policy \P{i} corresponding to \myLambda(\vi{i}). The filtering algorithm returns as output the set of compatible services that match the policy.
+
+ Formally, let us consider a set $S^c$ of candidate services \si{j}, each one annotated with a profile. The filtering algorithm is executed for each \si{j}; it is successful if \si{j}'s profile satisfies \myLambda(\vi{i}) as the access control policy \P{i}; otherwise, \si{j} is discarded and not considered for selection. The filtering algorithm finally returns a subset $S'\subseteq S^c$ of compatible services, which represent the possible candidates for selection.
- \item \textit{Comparison Algorithm} - Upon retrieving a set $S'$ of compatible services \si{j}, it produces a ranking of these services according to some metrics that evaluates the quality loss introduced by each service when integrated in the pipeline instance. More details about the metrics are provided in Section \ref{sec:metrics}.
- %Formally, compatible services \si{j}$\in$S' are ranked on the basis of a scoring function.
- The best service \si{j} is then selected and integrated in $\vii{i}\in \Vp$. There are many ways of choosing relevant metrics, we present those used in this article in Section \ref{sec:metrics}.
+ \item \textit{Comparison Algorithm} - Upon retrieving a set $S'$ of compatible services \si{j}, it produces a ranking of these services according to some metrics that evaluate the quality loss introduced by each service when integrated in the pipeline instance. More details about the metrics are provided in Section \ref{sec:metrics}.
+ %Formally, compatible services \si{j}$\in$S' are ranked on the basis of a scoring function.
+ The best service \si{j} is then selected and integrated in $\vii{i}\in \Vp$. There are many ways of choosing relevant metrics; we present those used in this article in Section \ref{sec:metrics}.
\end{itemize}
When all vertices $\vi \in V$ have been visited, G' contains a service instance $s'_i$ for each \vii{i}$\in$\Vp, and the \pipelineInstance is complete. We note that each vertex \vii{i} is annotated with a policy \P{i} according to \myLambda. When the pipeline instance is triggered, before any service can be executed, policy \P{i} is evaluated and enforced. In case policy evaluation is \emph{true}, data transformation \TP\ is applied; otherwise, a default transformation that deletes all data is applied.
@@ -262,8 +287,8 @@ \subsection{Example}\label{sec:example}
 Services $s_1$ and $s_2$ are annotated with a profile that satisfies the data protection requirements in \P{1} and \P{2}, respectively.
 The third service $s_3$ is annotated with a profile that does not satisfy the data protection requirements in \P{3}.
 The filtering algorithm then returns the set $S'=\{s_1,s_2\}$.
- The comparison algorithm is fnally applied to $S'$ and returns a ranking of the services according to quality metrics, where $s_1$ is ranked first. $s_1$ is then selected and integrated in $\vii{1}\in \Vp$.
-
+ The comparison algorithm is finally applied to $S'$ and returns a ranking of the services according to quality metrics, where $s_1$ is ranked first. $s_1$ is then selected and integrated in $\vii{1}\in \Vp$.
+
 The same logic is applied to \vi{2} and \vi{3}.
\end{example}
diff --git a/system_model.tex b/system_model.tex
index 0913808..269790a 100644
--- a/system_model.tex
+++ b/system_model.tex
@@ -16,36 +16,55 @@ \subsection{System Model}\label{sec:systemmodel}
 \begin{description}
 \item[Service,] a software distributed by a \textbf{service provider} that performs a specific task according to access control privileges on data; %, a service can be tagged with some policies %, a service is characterized by two function: the service function and the policy function.
 \item[Pipeline,] a sequence of connected services that collect, prepare, process, and analyze data in a structured and automated manner. We distinguish between a \textbf{pipeline template} that acts as a skeleton, specifying the structure of the pipeline and the (non-)functional requirements driving service selection, and a \textbf{pipeline instance} instantiating the template with services according to the specified requirements;
- \item[Data Governance Policy], a structured set of confidentiality guidelines, rules, and procedures regulating data access and protection;
+ \item[Data Governance Policy,] a structured set of confidentiality guidelines, rules, and procedures regulating data access and protection;
 \item[User] that executes an analytics pipeline on the data. We assume that the data targeted by the analytics pipeline are ready for analysis, that is, they underwent a preparatory phase addressing issues such as missing values, outliers, and formatting discrepancies.
This ensures that the data are in an optimal state for subsequent analysis. \end{description} -The \user starts its analytics by first selecting a pipeline template among a set of functionally-equivalent templates. The template is selected according to the \user\ non-functional requirements and then instantiated in a pipeline instance. In particular, for each component service in the template, a real service is selected among a list of candidate services in the instance. Candidate services are functionally equivalent and comply with the privacy policies specified in the template. +The \user starts its analytics by first selecting a pipeline template among a set of functionally-equivalent templates. +The template is selected according to the \user\ non-functional requirements and then instantiated in a pipeline instance. +\hl{In particular, for each component service in the template, a real service is selected among a list of candidate services in the instance. + Candidate services are functionally equivalent and comply with the privacy policies specified in the template}. + Candidate services are ranked based on their ability to retain the maximum amount of information (\emph{data quality} in this paper), while maintaining a minimum level of privacy. -%Priority is given to the service that maximizes data quality, while maintaining the same level of privacy. This evaluation process aids in identifying the most suitable service for a particular step in the pipeline. Upon selecting the most suitable service for each component service in the pipeline template, the pipeline instance is completed and ready for execution. -%Our goal is not to formulate novel service composition algorithms, but rather to establish a data governance framework. +It is important to note that our data governance approach builds on the following assumption: \emph{upholding a larger quantity of data is linked to better data quality.} +While this assumption is not true in all settings, it correctly represents many real-world scenarios. We leave a solution that departs from this assumption to our future work. -It is important to note that our data governance approach builds on the following assumption: \emph{upholding a larger quantity of data is linked to better data quality.} While this assumption is not true in all settings, it correctly represents many real-world scenarios. We leave a solution that departs from this assumption to our future work. -e \subsection{Service Pipeline and Reference Scenario}\label{sec:service_definition} -We consider a service-based environment where a service pipeline is designed to analyze -data. We define a service pipeline as a graph defined as follows. % and depicted in \cref{fig:service_pipeline}. +We consider a service-based environment where a service pipeline is designed to analyze data. +We define a service pipeline as a graph defined as follows. % and depicted in \cref{fig:service_pipeline}. \begin{definition}[\pipeline]\label{def:pipeline} - A \pipeline is as a direct acyclic graph G(\V,\E), where \V\ is a set of vertices and \E\ is a set of edges connecting two vertices \vi{i},\vi{k}$\in$\V. 
The graph has a root \vi{r}$\in$\V, a vertex \vi{i}$\in$\V$_S$ for each service $s_i$, two additional vertices \vi{c},\vi{m}$\in$\V$_{\timesOperator}$$\subset$\V\ for each alternative ($\timesOperator$) structure modeling the alternative execution (\emph{choice}) of operations and the retrieval (\emph{merge}) of the results, respectively, and two additional vertices \vi{f},\vi{j}$\in$\V$_{\plusOperator}$$\subset$\V\ for each parallel ($\plusOperator$) structure modeling the contemporary execution (\emph{fork}) of operations and the integration (\emph{join}) of their results, respectively.
+ A \pipeline is a directed acyclic graph G(\V,\E), where \V\ is a set of vertices and \E\ is a set of edges connecting two vertices \vi{i},\vi{k}$\in$\V.
+ The graph has a root \vi{r}$\in$\V, a vertex \vi{i}$\in$\V$_S$ for each service $s_i$, two additional vertices \vi{c},\vi{m}$\in$\V$_{\timesOperator}$$\subset$\V\ for each alternative ($\timesOperator$) structure modeling the alternative execution (\emph{choice}) of operations and the retrieval (\emph{merge}) of the results, respectively, and two additional vertices \vi{f},\vi{j}$\in$\V$_{\plusOperator}$$\subset$\V\ for each parallel ($\plusOperator$) structure modeling the contemporary execution (\emph{fork}) of operations and the integration (\emph{join}) of their results, respectively.
\end{definition}
-We note that \{\vi{r}\}$\cup$\V$_S$$\cup$\V$_{\timesOperator}$$\cup$V$_{\plusOperator}$$=$\V, and \vi{c}, \vi{m}, \vi{f}, and \vi{j} model branching for alternative/parallel structures. We also note that root \vi{r} possibly represents the orchestrator.
+We note that \{\vi{r}\}$\cup$\V$_S$$\cup$\V$_{\timesOperator}$$\cup$V$_{\plusOperator}$$=$\V, and \vi{c}, \vi{m}, \vi{f}, and \vi{j} model branching for alternative/parallel structures.
+We also note that root \vi{r} possibly represents the orchestrator.
% A service pipeline is as a direct acyclic graph G(\V,\E), where \V\ is a set of vertices, one for each service $s_i$ in the pipeline, \E\ is a set of edges connecting two services $s_i$ and $s_j$, and \myLambda\ is an annotation function that assigns a label \myLambda(\vi{i}), corresponding to a data transformation \F\ implemented by the service $s_i$, for each vertex \vi{i}$\in$\V.
Our reference scenario considers a service pipeline analyzing a dataset of individuals detained in Department of Correction facilities in the state of Connecticut while awaiting trial.
In particular, the user, a member of the Connecticut Department of Correction (DOC), seeks to compare admission trends in Connecticut prisons with those in other US states.
The user's preferences align with a predefined pipeline template that orchestrates the following sequence of operations:
-\emph{i)} Anonymization of the dataset.
-\emph{ii)} Data enrichment, integrating data from the states of New York and New Hampshire.
-\emph{iii)} Transformation of the dataset to derive state-specific data aggregations, including statistical measures like averages, medians, and clustering-based statistics.
-\emph{iv)} Storage of the results in the corresponding states. Specifically, one copy remains in Connecticut (where sensitive information in the source dataset is not protected), while two additional copies are distributed to New York and New Hampshire (with sensitive information from the source dataset being safeguarded).
+\begin{enumerate*}[label=(\roman*)]
+ \item \emph{Data preparation and protection}, including data cleaning and anonymization;
+ \item \emph{Data enrichment}, including the integration of data from other states;
+ \item \emph{Data analysis}, including statistical measures like averages, medians, and clustering-based statistics;
+ \item \emph{Machine learning task}, including training and inference;
+ \item \emph{Data storage}, including the storage of the results in the corresponding states. Specifically, one copy remains in Connecticut (where sensitive information in the source dataset is not protected),
+ while two additional copies are distributed to New York and New Hampshire (with sensitive information from the source dataset being safeguarded).
+\end{enumerate*}
+
-We note that the template requires the execution of the entire service within a single country. If the data needs to be transmitted beyond the boundaries of Connecticut, data protection measures must be implemented. A visual representation of the flow is presented in Figure \ref{fig:service_composition_example}.
+We note that the template requires the execution of the entire service within a single country.
+If the data needs to be transmitted beyond the boundaries of Connecticut, data protection measures must be implemented.
+A visual representation of the flow is presented in Figure \ref{fig:service_composition_example}.
\begin{figure}
\includegraphics[width=0.98\columnwidth]{service_composition_example}
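As a closing illustration, the following minimal sketch (in Python) mirrors the two-step selection described in the instantiation subsection, where $s_3$ is discarded by the filtering algorithm and $s_1$ is ranked first by the comparison algorithm; the profile attributes, the policy encoding, and the quality scores are invented for this example and are not part of the paper's data model.

    from dataclasses import dataclass

    # Policy conditions of the form (attr_name, op, attr_value).
    OPS = {"=":  lambda a, b: a == b, "!=": lambda a, b: a != b,
           "<":  lambda a, b: a < b,  ">":  lambda a, b: a > b,
           "<=": lambda a, b: a <= b, ">=": lambda a, b: a >= b}

    @dataclass
    class Service:
        name: str
        profile: dict    # attributes of the candidate service (hypothetical)
        quality: float   # e.g. a weighted Jaccard score w.r.t. the source data

    def satisfies(profile, policy):
        # Filtering check: every policy condition must hold on the profile.
        return all(a in profile and OPS[op](profile[a], v) for a, op, v in policy)

    def instantiate_vertex(policy, candidates):
        compatible = [s for s in candidates if satisfies(s.profile, policy)]  # filtering
        return max(compatible, key=lambda s: s.quality)                       # comparison

    policy_1 = [("region", "=", "CT"), ("role", "=", "DOC")]
    candidates = [Service("s1", {"region": "CT", "role": "DOC"}, 0.92),
                  Service("s2", {"region": "CT", "role": "DOC"}, 0.88),
                  Service("s3", {"region": "NY", "role": "DOC"}, 0.99)]
    print(instantiate_vertex(policy_1, candidates).name)   # "s1"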