Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Noise Functionality to DataFrame Columns in Utils Package #72

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import unittest

import pandas as pd

from tests import d1_path
from valentine.data_sources.utils import get_encoding, get_delimiter, is_date
from valentine.data_sources.utils import get_encoding, get_delimiter, is_date, add_noise_to_df_column
from valentine.utils.utils import is_sorted, convert_data_type


Expand Down Expand Up @@ -30,3 +32,12 @@ def test_get_delimiter(self):
def test_is_date(self):
    # A full timestamp with microseconds must be recognised as a date.
    timestamp = "2019-04-26 18:03:50.941332"
    assert is_date(timestamp)

def test_add_noise_to_df_column(self):
    # The noise comes from NumPy's global RNG; seed it so the outcome is
    # reproducible, and restore the previous state so other tests are unaffected.
    import numpy as np

    def fresh_df():
        # add_noise_to_df_column writes into the frame it receives, so every
        # assertion gets its own untouched copy of the fixture.
        return pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']})

    assert_df = fresh_df()
    rng_state = np.random.get_state()
    np.random.seed(0)
    try:
        # Zero noise must leave both numeric and string columns untouched.
        assert add_noise_to_df_column(fresh_df(), 'a', 0.0)['a'].equals(assert_df['a'])
        assert add_noise_to_df_column(fresh_df(), 'b', 0.0)['b'].equals(assert_df['b'])
        # Near-maximal noise must alter both columns.
        assert not add_noise_to_df_column(fresh_df(), 'a', 0.99999)['a'].equals(assert_df['a'])
        assert not add_noise_to_df_column(fresh_df(), 'b', 0.99999)['b'].equals(assert_df['b'])
    finally:
        np.random.set_state(rng_state)
28 changes: 27 additions & 1 deletion valentine/data_sources/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import chardet
import csv

import chardet
import numpy as np
from dateutil.parser import parse


Expand Down Expand Up @@ -40,3 +42,27 @@ def is_date(string, fuzzy=False):
return True
except Exception:
return False


def add_noise_to_df_column(df, column_name, noise_level):
    """
    Adds noise to a specified column in a DataFrame.

    The column is overwritten on the DataFrame that is passed in (the input is modified in place) and that
    same DataFrame object is returned. Columns that are neither numeric nor string-like are left untouched.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column to which noise will be added.
    - column_name (str): The name of the column to which noise will be added.
    - noise_level (float): The level of noise to be added. For numerical columns (any integer or floating-point
                           dtype), this indicates the standard deviation of the Gaussian noise. For string
                           columns, it represents the probability of permuting the characters of each string,
                           decided independently per row.

    Returns:
    - pd.DataFrame: The DataFrame with noise added to the specified column.
    """
    column = df[column_name]
    dtype = column.dtype

    # dtype.kind covers every integer ("i"), unsigned ("u") and float ("f") width,
    # not only the 64-bit variants.
    if dtype.kind in ("i", "u", "f"):
        noise = np.random.normal(0, noise_level, column.shape[0])
        df[column_name] = column + noise
    # "object" is the classic pandas string storage; "string"/"str" are the
    # names of the dedicated pandas string dtypes.
    elif dtype.name in ("object", "string", "str"):
        # One independent draw per row decides whether that row's string is shuffled.
        shuffle_mask = np.random.rand(column.shape[0]) < noise_level
        df[column_name] = [
            ''.join(np.random.permutation(list(str(value)))) if shuffle else value
            for value, shuffle in zip(column, shuffle_mask)
        ]
    return df
Loading