Skip to content

Commit

Permalink
minor instance
Browse files Browse the repository at this point in the history
  • Loading branch information
antongiacomo committed Nov 13, 2023
1 parent 42e91ad commit 88cb086
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 126 deletions.
197 changes: 196 additions & 1 deletion Big Data Access Control - extension.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,201 @@
"cSpell.words": [
"Jaccard"
],
"window.commandCenter": false
"window.commandCenter": false,
"files.exclude": {
"**/*.aux": true,
"**/*.lof": true,
"**/*.log": true,
"**/*.lot": true,
"**/*.fls": true,
"**/*.out": true,
"**/*.toc": true,
"**/*.fmt": true,
"**/*.fot": true,
"**/*.cb": true,
"**/*.cb2": true,
"**/.*.lb": true,
"**/*.dvi": true,
"**/*.xdv": true,
"**/*-converted-to.*": true,
"**/main.pdf": true,
"**/*.bbl": true,
"**/*.bcf": true,
"**/*.blg": true,
"**/*-blx.aux": true,
"**/*-blx.bib": true,
"**/*.run.xml": true,
"**/*.fdb_latexmk": true,
"**/*.synctex": true,
"**/*.synctex(busy)": true,
"**/*.synctex.gz": true,
"**/*.synctex.gz(busy)": true,
"**/*.pdfsync": true,
"**/latex.out/": true,
"**/*.alg": true,
"**/*.loa": true,
"**/acs-*.bib": true,
"**/*.thm": true,
"**/*.nav": true,
"**/*.pre": true,
"**/*.snm": true,
"**/*.vrb": true,
"**/*.soc": true,
"**/*.cut": true,
"**/*.cpt": true,
"**/*.spl": true,
"**/*.ent": true,
"**/*.lox": true,
"**/*.mf": true,
"**/*.mp": true,
"**/*.t[1-9]": true,
"**/*.t[1-9][0-9]": true,
"**/*.tfm": true,
"**/*.end": true,
"**/*.?end": true,
"**/*.[1-9]": true,
"**/*.[1-9][0-9]": true,
"**/*.[1-9][0-9][0-9]": true,
"**/*.[1-9]R": true,
"**/*.[1-9][0-9]R": true,
"**/*.[1-9][0-9][0-9]R": true,
"**/*.eledsec[1-9]": true,
"**/*.eledsec[1-9]R": true,
"**/*.eledsec[1-9][0-9]": true,
"**/*.eledsec[1-9][0-9]R": true,
"**/*.eledsec[1-9][0-9][0-9]": true,
"**/*.eledsec[1-9][0-9][0-9]R": true,
"**/*.acn": true,
"**/*.acr": true,
"**/*.glg": true,
"**/*.glo": true,
"**/*.gls": true,
"**/*.glsdefs": true,
"**/*.lzo": true,
"**/*.lzs": true,
"**/*.slg": true,
"**/*.slo": true,
"**/*.sls": true,
"**/*.gnuplot": true,
"**/*.table": true,
"**/*-gnuplottex-*": true,
"**/*.gaux": true,
"**/*.glog": true,
"**/*.gtex": true,
"**/*.4ct": true,
"**/*.4tc": true,
"**/*.idv": true,
"**/*.lg": true,
"**/*.trc": true,
"**/*.xref": true,
"**/*.brf": true,
"**/*-concordance.tex": true,
"**/*-tikzDictionary": true,
"**/*.lol": true,
"**/*.ltjruby": true,
"**/*.idx": true,
"**/*.ilg": true,
"**/*.ind": true,
"**/*.maf": true,
"**/*.mlf": true,
"**/*.mlt": true,
"**/*.mtc[0-9]*": true,
"**/*.slf[0-9]*": true,
"**/*.slt[0-9]*": true,
"**/*.stc[0-9]*": true,
"**/_minted*": true,
"**/*.pyg": true,
"**/*.mw": true,
"**/*.newpax": true,
"**/*.nlg": true,
"**/*.nlo": true,
"**/*.nls": true,
"**/*.pax": true,
"**/*.pdfpc": true,
"**/*.sagetex.sage": true,
"**/*.sagetex.py": true,
"**/*.sagetex.scmd": true,
"**/*.wrt": true,
"**/svg-inkscape/": true,
"**/*.sout": true,
"**/*.sympy": true,
"**/sympy-plots-for-*.tex/": true,
"**/*.upa": true,
"**/*.upb": true,
"**/*.pytxcode": true,
"**/pythontex-files-*/": true,
"**/*.listing": true,
"**/*.loe": true,
"**/*.dpth": true,
"**/*.md5": true,
"**/*.auxlock": true,
"**/*.ptc": true,
"**/*.tdo": true,
"**/*.hst": true,
"**/*.ver": true,
"**/*.lod": true,
"**/*.xcp": true,
"**/*.xmpi": true,
"**/*.xdy": true,
"**/*.xyc": true,
"**/*.xyd": true,
"**/*.ttt": true,
"**/*.fff": true,
"**/TSWLatexianTemp*": true,
"**/*.bak": true,
"**/*.sav": true,
"**/.texpadtmp": true,
"**/*.lyx~": true,
"**/*.backup": true,
"**/.*.swp": true,
"**/*~[0-9]*": true,
"**/*.tps": true,
"./auto/*": true,
"**/*.el": true,
"**/*-tags.tex": true,
"**/*.sta": true,
"**/*.lpz": true,
"**/*.xwm": true,
"**/*.vtc": true,
"**/*.glstex": true,
"**/.DS_Store": true,
"**/.AppleDouble": true,
"**/.LSOverride": true,
"**/Icon": true,
"**/._*": true,
"**/.DocumentRevisions-V100": true,
"**/.fseventsd": true,
"**/.Spotlight-V100": true,
"**/.TemporaryItems": true,
"**/.Trashes": true,
"**/.VolumeIcon.icns": true,
"**/.com.apple.timemachine.donotpresent": true,
"**/.AppleDB": true,
"**/.AppleDesktop": true,
"**/Network Trash Folder": true,
"**/Temporary Items": true,
"**/.apdisk": true,
"**/*.icloud": true,
".vscode/*": true,
".vscode/settings.json": false,
".vscode/tasks.json": false,
".vscode/launch.json": false,
".vscode/extensions.json": false,
".vscode/*.code-snippets": false,
"**/.history/": true,
"**/*.vsix": true,
"**/.history": true,
"**/.ionide": true,
"**/.venv/": true,
"*/*.csv": true,
".venv/**/*": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_x86.dylib": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_x86_64.dylib": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86.o": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86_64.o": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.*.so": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.*.pyd": true,
".venv/lib/python3.11/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.pyx": true
}
}
}
1 change: 1 addition & 0 deletions macro.tex
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

\newcommand{\T}{\ensuremath{T}}
\newcommand{\TF}[1]{\ensuremath{T^F_{#1}}}
\newcommand{\tf}[1]{\ensuremath{t^f_{#1}}}
\newcommand{\TP}{\ensuremath{T^{P}}}

\newcommand{\G}{\ensuremath{G}}
Expand Down
40 changes: 3 additions & 37 deletions metrics.tex
Original file line number Diff line number Diff line change
@@ -1,22 +1,5 @@
\section{Heuristics}\label{sec:coalition}
% Coalition building is a crucial process having direct impact on the quality of the analytics results. %Figure~\ref{fig:smet}
% Figure X shows how the data lineage is impacted by the processing lineage, and in particular by i) the \textit{coalition agreement} $\textit{CA}_C$ (i.e., the CA-driven transformations adopted for a given coalition) and ii) the transformations produced by the different jobs (job-specific transformations) that are part of a given coalition \coalition{}.
% Let us consider job $\job{1}^{\org{1}}$ of %Figure~\ref{fig:smet}
% Figure X; it receives as input the data \trans{1}(\dataset{1}), i.e., the dataset obtained from \dataset{1} after the transformation \trans{1}, which is associated with the data lineage by our AC model. It then produces data by applying the job-specific transformation to the input data (i.e., \trans{1}(\dataset{1})), generating \dataset{2}.
% We note that our Big Data Analytics pipeline model includes alternatives allowing different processing lineages (linearly independent paths in the Big Data graph G) that perform the same analytics but use different jobs (e.g., a lineage including k-means or a lineage using c-means). This leads to different job-specific transformations on the data for the same Big Data pipeline.
% In this paper, for the sake of simplicity we i) consider different coalitions for each processing lineage, ii) coalitions made of trustworthy organizations \org{i} providing candidate services for each job and iii) job-specific transformation not influenced by the organizations' behavior.
% In this scenario, since any coalition of a given processing lineage will produce the same job-specific data transformation, the analytics pipeline quality is impacted only by the \textit{coalition agreement} $\textit{CA}_C$ or rather by the transformations \trans{i} imposed by the given coalition \coalition{} on the data lineage.
% In the following we first present metrics to evaluate data quality across the data lineage, and then a set of solutions to build coalitions for given Big Data pipeline ensuring a given data quality.

% %\begin{example}\label{ex:p1j}
% %The choice of the specific deployment has an impact on the way in which the coalition \coalition{} of organizations \org{i} is formed as discussed in the following of this section.
% %Let us consider the following example where we have a pipeline made of just one ingestion job that can be offered by service provider $s_1$ or by service provider $s_2$. If $s_1$ is selected, transformation $T_1$ is triggered according to the authorization $s_1$ has on the data; in this example $s_1$ has full control, meaning that transformation $T_1$ is empty. If $s_2$ is selected, transformation $T_2$ is triggered according to the authorization $s_2$ has on the data; in this example, data labelled as PII are removed.
% %\end{example}
% %Considering the two data lineages generated by the two different coalitions in Example~\ref{ex:p1j}, the one involving $s_2$ produces significant changes to the data compared to the other one. These data changes can have a direct impact on the quality of the analytics outcomes; therefore, our goal is to build coalitions ensuring a specific data quality. This coalition building problem can be assimilated to xxx showing an exponential complexity ...
% %In the following we first introduce our data quality metrics and then our heuristics to solve the problem of coalition building.

% %\subsection{Data Quality metrics}
%\subsection{Coalition Heuristics}

\subsection{Metrics}\label{sec:metrics}

Data quality is a widely studied topic in the database management research community,
Expand All @@ -29,7 +12,6 @@ \subsection{Metrics}\label{sec:metrics}


In the following we present a set of metrics to evaluate the quality of the data at each step of the big data pipeline.
We

The proposed metrics can be classified into two categories, namely quantitative and statistical.
Initially, these metrics are applied to the original dataset (X) without any transformations, and subsequently, they are applied to the transformed dataset (Y).
Expand All @@ -48,29 +30,13 @@ \subsubsection{Jaccard coefficient}
Unlike other similarity measures, such as Euclidean distance, the Jaccard coefficient is not affected by the magnitude of the values in the dataset.
This property makes it suitable for datasets with categorical variables or nominal data, where the values do not have a meaningful numerical interpretation.

\subsubsection{Jaccard coefficent with weights} Let us consider two dataset X and Y of the same size.
The Jaccard coefficent is defined as:\[J(X,Y) = \frac{\sum_{i=1}^{n}w_i(x_i \cap y_i)}{\sum_{i=1}^{n}w_i(x_i \cup y_i)}\]
\subsubsection{Jaccard coefficient with weights} Let us consider two datasets X and Y of the same size.
The weighted Jaccard coefficient is defined as:\[J(X,Y) = \frac{\sum_{i=1}^{n}w_i(x_i \cap y_i)}{\sum_{i=1}^{n}w_i(x_i \cup y_i)}\]
It is computed by dividing the weighted cardinality of the intersection of the two sets by the weighted cardinality of their union, where the weights $w_i$ are those assigned to the elements in the sets.
Weights allow for the prioritization of certain features or elements in the datasets.
This approach can be particularly useful when some elements in the dataset have more importance or relevance than others.
By assigning weights to the elements, the weighted Jaccard similarity can account for this importance and provide a more accurate measure of similarity.

% \subsubsection{Kullback-Leibler divergence}
% Let us consider two dataset X and Y of the same size.
% The KL divergence is defined as:\[KL(X,Y) = \sum_{i=1}^{n}x_i \log \frac{x_i}{y_i}\]
% Which is computed by taking the sum of the product of each element in the first dataset and the logarithm of the ratio of the same element in the second dataset.
% The KL divergence is a measure of the difference between two probability distributions and is useful for comparing the dissimilarity of two datasets.


% \subsubsection{Kullback-Leibler divergence with weights} Let us consider two dataset X and Y of the same size. The weighted KL divergence is defined as:

% \[KL(X,Y) = \sum_{i=1}^{n}w_i(x_i \log \frac{x_i}{y_i})\]

% The weighted KL divergence is a variant of the KL divergence that incorporates weights to the elements in the datasets being compared.
% It allows for the prioritization of certain features or elements in the datasets.
% This approach can be particularly useful when some elements in the dataset have more importance or relevance than others.
% By assigning weights to the elements, the weighted KL divergence can account for this importance and provide a more accurate measure of dissimilarity.

\subsubsection{Jensen-Shannon Divergence}

Let us consider two datasets X and Y of the same size. The Jensen-Shannon divergence (JSD) is a symmetrized version of the KL divergence and can be used to measure the dissimilarity between the two probability distributions.
Expand Down
34 changes: 0 additions & 34 deletions policy.tex

This file was deleted.

Loading

0 comments on commit 88cb086

Please sign in to comment.