diff --git a/examples/data_profiler_demo.ipynb b/examples/data_profiler_demo.ipynb new file mode 100644 index 000000000..0b95b6781 --- /dev/null +++ b/examples/data_profiler_demo.ipynb @@ -0,0 +1,765 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc2826d9", + "metadata": {}, + "source": [ + "# Data Profiler - What's in your data?" + ] + }, + { + "cell_type": "markdown", + "id": "b997522b", + "metadata": {}, + "source": [ + "The library is designed to easily detect sensitive data and gather statistics on your datasets with just a few lines of code.\n", + "\n", + "This demo covers the following:\n", + "\n", + " - Basic usage of the Data Profiler\n", + " - The data reader class\n", + " - Updating and merging profiles\n", + " - Profile differences\n", + " - Graphing a profile\n", + " - Saving profiles\n", + " - Data labeling\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef404c84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + " \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "\n", + "# remove extra tf logging\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "f51971e3", + "metadata": {}, + "source": [ + "## Basic Usage of the Data Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "639e66d3", + "metadata": {}, + "source": [ + "This section shows the basic usage of the Data Profiler: a CSV dataset is read with the data reader, and the resulting Data object is passed to the Profiler to detect sensitive data and gather statistics."
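, + "\n", + "\n", + "The report's \"output_format\" option controls how the results are returned; besides \"compact\" (used below), DataProfiler also supports formats such as \"pretty\", \"serializable\", and \"flat\". A quick sketch, using the profile created in the next cell:\n", + "\n", + "```python\n", + "# sketch: request a flattened report instead of the compact one\n", + "report_flat = profile.report(report_options={\"output_format\": \"flat\"})\n", + "```"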
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "375eafc3-fa33-49ec-af7c-7d06644debb0", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# read, profile, and get the report in 3 lines\n", + "\n", + "# get the data\n", + "data = dp.Data(os.path.join(data_folder, \"csv/diamonds.csv\"))\n", + "\n", + "# profile the data\n", + "profile = dp.Profiler(data)\n", + "\n", + "# generate the report\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2cd8345-2ebd-4455-8f95-d430808031d2", + "metadata": {}, + "outputs": [], + "source": [ + "data.head() # data.data provides access to a pandas.DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cc43b3b-3fcd-498c-89cd-ac646feb144a", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# print the report\n", + "print('\\nREPORT:\\n' + '='*80)\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "74027cfd", + "metadata": {}, + "source": [ + "## Data reader class -- Automatic Detection" + ] + }, + { + "cell_type": "markdown", + "id": "41364888", + "metadata": { + "tags": [] + }, + "source": [ + "Within the Data Profiler, there are 5 data reader classes:\n", + "\n", + " * CSVData (delimited data: CSV, TSV, etc.)\n", + " * JSONData\n", + " * ParquetData\n", + " * AVROData\n", + " * TextData" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823829f4", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "\n", + "all_files = csv_files + json_files + parquet_files + avro_files + text_files\n", + "\n", + "print('filepath' + ' ' * 58 + 'data type')\n", + "print('='*80)\n", + "for file in all_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " ############################\n", + " ##### READING THE DATA #####\n", + " data = dp.Data(filepath)\n", + " ############################\n", + " print(\"{:<65} {:<15}\".format(file, data.data_type))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47b0290a-9434-47cd-9fd5-2c4e9e91240a", + "metadata": {}, + "outputs": [], + "source": [ + "# importing from a url\n", + "data = dp.Data('https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/diamonds.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a6cede34-2734-466f-a1f6-afd2cc04a67c", + "metadata": {}, + "source": [ + "## Data 
Profiling\n", + "As we saw above, profiling is as simple as:\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "\n", + "data = dp.Data('my_data.csv')\n", + "profiler = dp.Profiler(data)\n", + "report = profiler.report(report_options={\"output_format\": \"compact\"})\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "2c44a0c3-a5fa-4aba-891b-9cb4e7c2b204", + "metadata": {}, + "source": [ + "### Update profiles - the case for batching / streaming data" + ] + }, + { + "cell_type": "markdown", + "id": "965f8c85", + "metadata": {}, + "source": [ + "The Profiler allows users to send data to a profile in batches, updating the existing profile with each new chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "deae4b9d-281f-4835-869c-df86309ddcb8", + "metadata": {}, + "outputs": [], + "source": [ + "# divide dataset in half\n", + "data = dp.Data(os.path.join(data_folder, \"csv/diamonds.csv\"))\n", + "df = data.data\n", + "df1 = df.iloc[:int(len(df)/2)]\n", + "df2 = df.iloc[int(len(df)/2):]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03b4705e-a95f-4551-aef2-af22bd62fd5f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create a profile from the first half, then update it with the second half\n", + "profile = dp.Profiler(df1)\n", + "\n", + "############################\n", + "####### BATCH UPDATE #######\n", + "profile.update_profile(df2)\n", + "############################\n", + "report_batch = profile.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# print('\\nREPORT:\\n' + '='*80)\n", + "print(json.dumps(report_batch, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "c547f051", + "metadata": {}, + "source": [ + "### Merge profiles -- the case for parallelization" + ] + }, + { + "cell_type": "markdown", + "id": "a5292962", + "metadata": {}, + "source": [ + "Two profiles can be added together to create a combined profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565b8d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# create two profiles and merge\n", + "profile1 = dp.Profiler(df1)\n", + "profile2 = dp.Profiler(df2)\n", + "profile_merge = profile1 + profile2\n", + "\n", + "# check results of the merged profile\n", + "report_merge = profile_merge.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# # print the report\n", + "# print('\\nREPORT:\\n' + '='*80)\n", + "# print(json.dumps(report_merge, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "f008bb48-32c7-4038-9096-978a9f74e545", + "metadata": {}, + "source": [ + "# Differences in Data\n", + "Profile differences can be applied to both structured and unstructured datasets. 
\n", + "\n", + "Such reports can highlight the differences between training and validation data, as in this pseudo example:\n", + "```python\n", + "profiler_training = dp.Profiler(training_data)\n", + "profiler_testing = dp.Profiler(testing_data)\n", + "\n", + "validation_report = profiler_training.diff(profiler_testing)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0a6ff30-f015-45f0-a5a4-414c837c9029", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "# structured differences example\n", + "data_split_differences = profile1.diff(profile2)\n", + "pprint(data_split_differences)" + ] + }, + { + "cell_type": "markdown", + "id": "48baef5f-c57d-48cd-bf18-b34f566d35df", + "metadata": {}, + "source": [ + "## Graphing a Profile\n", + "\n", + "We've also added the ability to generate visual reports from a profile.\n", + "\n", + "The following plots are currently available and work directly with your profiles:\n", + "\n", + " * missing values matrix\n", + " * histogram (numeric columns only)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28b23fdb-026e-4394-ac37-bbd1e052c68c", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# get the data\n", + "data = dp.Data(os.path.join(data_folder, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "\n", + "# profile the data\n", + "profile = dp.Profiler(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac6fa42a-79eb-47a1-a058-f60b4de4e83d", + "metadata": {}, + "outputs": [], + "source": [ + "# generate a missing values matrix\n", + "fig = plt.figure(figsize=(8, 6), dpi=100)\n", + "fig = dp.graphs.plot_missing_values_matrix(profile, ax=fig.gca(), title=\"Missing Values Matrix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3402553-d2b8-4f73-85f3-45e739fb4144", + "metadata": {}, + "outputs": [], + "source": [ + "# generate histogram of all int/float columns\n", + "fig = dp.graphs.plot_histograms(profile)\n", + "fig.set_size_inches(8, 6)\n", + "fig.set_dpi(100)" + ] + }, + { + "cell_type": "markdown", + "id": "2335024f-a233-4e2b-aafd-dbf956562524", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "de3e0e30-33e3-4b18-8240-7e5c6029eb97", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load, and then manipulate profiles."
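, + "\n", + "\n", + "As a minimal sketch of the load-then-manipulate workflow (\"new_data.csv\" is a hypothetical new batch, not one of the bundled test files):\n", + "\n", + "```python\n", + "# reload a previously saved profile and merge it with a profile of new data\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "new_profile = dp.Profiler(dp.Data(\"new_data.csv\"))  # hypothetical new batch\n", + "combined_profile = loaded_profile + new_profile\n", + "```"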
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffdb2126-cc11-49f1-aa5e-a533efc59f25", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_folder, \"csv/diamonds.csv\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "# report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "# print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "4787d4ff-8bd7-4c91-b197-dc875fb1d2d9", + "metadata": {}, + "source": [ + "# Unstructured Profiling\n", + "\n", + "Similar to structured datasets, text data can also be profiled with the unstructured profiler. \n", + "It currently provides an easy overview of information in the text such as:\n", + " * memory size\n", + " * char stats\n", + " * word stats\n", + " * data labeling entity stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f9dca8e-72fe-4f0f-ae8e-33e8974e4107", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "profiler_string = dp.Profiler(\"This is my random text: 332-23-2123\")\n", + "print(json.dumps(profiler_string.report(), indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc759400-13ff-4451-bbe5-645368d7aa75", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "email_data = [\"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. \\n\" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? 
I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"]\n", + "\n", + "profiler_email = dp.Profiler(email_data, profiler_type='unstructured')\n", + "print(json.dumps(profiler_email.report(), indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "6ffa6d25-4d77-412a-b020-e34363d49786", + "metadata": {}, + "source": [ + "## Merging Unstructured Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2a2180e-c4d0-4f06-bc0f-a9b6bcbb0d47", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "merged_profile = profiler_string + profiler_email\n", + "print(json.dumps(merged_profile.report(), indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "f123f695-6bb5-467f-b90b-ea27fb9c6595", + "metadata": {}, + "source": [ + "## Differences in Unstructured Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0815362-5308-43fc-81de-4afea20118c2", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# unstructured differences example\n", + "validation_report = profiler_email.diff(profiler_string)\n", + "print(json.dumps(validation_report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "98435618-b157-40b3-92e7-3c8f42ea910f", + "metadata": {}, + "source": [ + "## Data Labeling" + ] + }, + { + "cell_type": "markdown", + "id": "b4cc0029-781d-4717-ad94-a26df0e9857f", + "metadata": {}, + "source": [ + "The Labeler is a pipeline designed to make building, training, and predicting with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor."
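, + "\n", + "\n", + "As a minimal sketch (assuming the default pretrained unstructured labeler and a made-up sentence), predictions can be obtained directly from a DataLabeler; the cells below build on this with helper functions and output formatting:\n", + "\n", + "```python\n", + "# sketch: run the default unstructured labeler on a made-up sentence\n", + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "predictions = labeler.predict([\"John's phone number is 123-456-7890\"])\n", + "print(predictions['pred'])  # raw predictions; see below for readable NER output\n", + "```"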
+ ] + }, + { + "cell_type": "markdown", + "id": "f013f36f-2b83-473d-88f9-090cde664b8a", + "metadata": {}, + "source": [ + "![Data Labeler flowchart](DL-Flowchart.png \"Data Labeler pipeline\")" + ] + }, + { + "cell_type": "markdown", + "id": "d2689e4f-02c3-480d-a420-09bba1dccbe9", + "metadata": {}, + "source": [ + "Default labels:\n", + "* UNKNOWN\n", + "* ADDRESS\n", + "* BAN (bank account number, 10-18 digits)\n", + "* CREDIT_CARD\n", + "* EMAIL_ADDRESS\n", + "* UUID\n", + "* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)\n", + "* IPV4\n", + "* IPV6\n", + "* MAC_ADDRESS\n", + "* PERSON\n", + "* PHONE_NUMBER\n", + "* SSN\n", + "* URL\n", + "* US_STATE\n", + "* DRIVERS_LICENSE\n", + "* DATE\n", + "* TIME\n", + "* DATETIME\n", + "* INTEGER\n", + "* FLOAT\n", + "* QUANTITY\n", + "* ORDINAL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a838d98-9c39-4539-8cff-ca862e6512bb", + "metadata": {}, + "outputs": [], + "source": [ + "# helper functions for printing results\n", + "\n", + "def get_structured_results(results):\n", + " \"\"\"Helper function to get data labels for each column.\"\"\"\n", + " columns = []\n", + " predictions = []\n", + " samples = []\n", + " for col in results['data_stats']:\n", + " columns.append(col['column_name'])\n", + " predictions.append(col['data_label'])\n", + " samples.append(col['samples'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions, 'Sample': samples})\n", + " return df_results\n", + "\n", + "def get_unstructured_results(data, results):\n", + " \"\"\"Helper function to get data labels for each labeled piece of text.\"\"\"\n", + " labeled_data = []\n", + " for pred in results['pred'][0]:\n", + " labeled_data.append([data[0][pred[0]:pred[1]], pred[2]])\n", + " label_df = pd.DataFrame(labeled_data, columns=['Text', 'Labels'])\n", + " return label_df\n", + " \n", + "\n", + "pd.set_option('display.width', 100)" + ] + }, + { + "cell_type": "markdown", + "id": "fb094133-0937-4218-b922-a515787d70a7", + "metadata": {}, + "source": [ + "### Structured Labeling\n", + "\n", + "Each column within your profile is given a suggested data label." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62fba6e3-3b2c-439b-bb56-59e977986286", + "metadata": {}, + "outputs": [], + "source": [ + "# profile data and get labels for each column\n", + "data = dp.Data(os.path.join(data_folder, \"csv/SchoolDataSmall.csv\"))\n", + "profiler = dp.Profiler(data)\n", + "report = profiler.report()\n", + "\n", + "\n", + "print('\\nLabel Predictions:\\n' + '=' * 85)\n", + "print(get_structured_results(report))" + ] + }, + { + "cell_type": "markdown", + "id": "2bb397e1-aa6d-48a5-8827-f802f04a0afc", + "metadata": {}, + "source": [ + "### Unstructured Labeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f81ae0be-556e-4804-a49e-b358a3b341c9", + "metadata": {}, + "outputs": [], + "source": [ + "# load data\n", + "email_data = [\"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. 
\n" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3713fbc5-872d-485c-bcb9-28ec552f531f", + "metadata": {}, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "\n", + "# set the postprocessor to return word-level NER output:\n", + "# (start position, end position, label) tuples\n", + "labeler.set_params(\n", + " { 'postprocessor': { 'output_format': 'ner', 'use_word_level_argmax': True } } \n", + ")\n", + "\n", + "# make predictions and get an NER label for each detected entity\n", + "predictions = labeler.predict(email_data)\n", + "\n", + "# display results\n", + "print('=========================Prediction========================')\n", + "print(get_unstructured_results(email_data, predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6643e98b-b056-4b9a-b415-0df3bd49aa44", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}