Skip to content

Commit

Permalink
Feature: Add Email Generator (a new type of sdgx.data_processor) (#184)
Browse files Browse the repository at this point in the history
* add comment

* Create email generator

* Create testcase

* complete email generator testcase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add annotations from __future__

* register into DataProcessorManager

In the future, we need to introduce processor_level to control the loading order of data_processor instead of hard coding it.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add missing imports

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix faker localize typo

* use email_validator for email generator testcase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix fit method in EmailGenerator

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
MooooCat and pre-commit-ci[bot] authored Jun 21, 2024
1 parent 1ef4ca2 commit 14ad5e8
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ classifiers = [
]

[project.optional-dependencies]
test = ["pytest", "pytest-cov", "coverage<7"]
test = ["pytest", "pytest-cov", "coverage<7", "email_validator"]
docs = [
"Sphinx",
"pydata-sphinx-theme",
Expand Down
65 changes: 65 additions & 0 deletions sdgx/data_processors/generators/email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from __future__ import annotations

from typing import Any

import pandas as pd
from faker import Faker

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.generators.pii import PIIGenerator

fake = Faker()


class EmailGenerator(PIIGenerator):
    """
    Strip email columns before processing and regenerate them afterwards.

    This class is a subclass of `PIIGenerator`. After `fit`, the names of the
    email columns reported by the metadata are kept in `email_columns_list`.
    `convert` removes those columns from a pd.DataFrame, and `reverse_convert`
    attaches freshly generated fake email addresses under the same column names.

    Attributes:
        email_columns_list (list): Names of the columns in the pd.DataFrame that contain email addresses.
        fitted (bool): Set to True once `fit` has been called.

    Methods:
        fit(metadata: Metadata | None = None): Reads the email column names from the metadata.
        convert(raw_data: pd.DataFrame) -> pd.DataFrame: Removes the email columns from the pd.DataFrame.
        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Attaches generated email columns to the pd.DataFrame.
    """

    # Class-level default only; `fit` rebinds a fresh list on the instance
    # instead of mutating this shared object.
    email_columns_list: list = []

    fitted: bool = False

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Record which columns hold email addresses.

        Args:
            metadata: Metadata queried with the key ``"email_columns"``. When it
                is None, or the lookup yields nothing, no column is treated as
                an email column and `convert` / `reverse_convert` become no-ops.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        # `metadata` defaults to None, so guard the lookup instead of raising
        # AttributeError on `None.get(...)`.
        email_columns = metadata.get("email_columns") if metadata is not None else None
        self.email_columns_list = list(email_columns) if email_columns else []

        self.fitted = True

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Remove every known email column from `raw_data`.

        Args:
            raw_data: The input pd.DataFrame.

        Returns:
            `raw_data` itself when there are no email columns, otherwise the
            result of successively applying `self.remove_columns` for each
            email column.
        """
        if not self.email_columns_list:
            return raw_data
        processed_data = raw_data
        # remove every email column from the dataframe
        for each_col in self.email_columns_list:
            processed_data = self.remove_columns(processed_data, each_col)
        return processed_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Attach one generated email column per known email column.

        Args:
            processed_data: The pd.DataFrame to extend.

        Returns:
            `processed_data` itself when there are no email columns, otherwise
            the result of `self.attach_columns` with one new column per entry of
            `email_columns_list`, each holding as many values produced by
            `fake.ascii_company_email()` as `processed_data` has rows.
        """
        if not self.email_columns_list:
            return processed_data
        df_length = processed_data.shape[0]
        for each_col_name in self.email_columns_list:
            # The addresses are newly generated; they are not the removed originals.
            each_email_col = [fake.ascii_company_email() for _ in range(df_length)]
            each_email_df = pd.DataFrame({each_col_name: each_email_col})
            processed_data = self.attach_columns(processed_data, each_email_df)

        return processed_data


@hookimpl
def register(manager):
    """Register the `EmailGenerator` class with `manager` under the name "EmailGenerator"."""
    processor_name = "EmailGenerator"
    manager.register(processor_name, EmailGenerator)
9 changes: 8 additions & 1 deletion sdgx/data_processors/generators/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,11 @@


class PIIGenerator(Generator):
    """
    The PIIGenerator class is a subclass of the Generator class. It is designed to generate PII (Personally Identifiable Information) objects.

    This class is responsible for:
    - providing batch generation methods for different types of PII objects (columns).
    - providing randomised generation methods for different types of PII objects.
    - generating PII columns with constraints such as geography, attribution, etc.
    """
3 changes: 2 additions & 1 deletion sdgx/data_processors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ class DataProcessorManager(Manager):
"""

# Names are stored lower-cased. The order is hard-coded here, with
# ColumnOrderTransformer appended separately so that it stays last.
preset_defalut_processors = [
    p.lower()
    for p in ["IntValueFormatter", "DatetimeFormatter", "NonValueTransformer", "EmailGenerator"]
] + ["ColumnOrderTransformer".lower()]
"""
preset_defalut_processors list stores the lowercase names of the transformers loaded by default. When using the synthesizer, they will be loaded by default to facilitate user operations.
Expand Down
105 changes: 105 additions & 0 deletions tests/data_processors/generators/test_email_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from __future__ import annotations

import datetime
import random
import re

import pandas as pd
import pytest
from faker import Faker
from pydantic import BaseModel, EmailStr

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.generators.email import EmailGenerator

fake = Faker(locale="zh_CN")
fake_en = Faker(["en_US"])


@pytest.fixture
def chn_personal_test_df():
    """
    Yield a 1000-row pd.DataFrame of fake personal records.

    Most fields come from the ``zh_CN`` Faker instance (`fake`); the English
    name comes from `fake_en`. The ``email`` column is the one exercised by
    the email generator test.
    """
    columns = [
        "ssn_sfz",
        "chn_name",
        "eng_name",
        "gender",
        "birth_date",
        "age",
        "email",
        "mobile_phone_no",
        "chn_address",
        "postcode",
        "job",
        "company_name",
    ]
    total_rows = 1000
    current_year = datetime.datetime.today().year

    records = []
    for _ in range(total_rows):
        gender = random.choice(["male", "female"])
        # Family name first, then a gender-specific name from the same locale.
        family_name = fake.last_name()
        pick_name = fake.name_male if gender == "male" else fake.name_female
        chn_name = family_name + pick_name()
        eng_name = fake_en.name()
        birth_date = fake.date()
        # `fake.date()` yields a string whose first four characters are the year.
        age = current_year - int(birth_date[:4])
        email = fake.email()
        phone = fake.phone_number()
        sfz = fake.ssn()
        address = fake.address()
        job = fake.job()
        company = fake.company()
        postcode = fake.postcode()

        records.append(
            {
                "ssn_sfz": sfz,
                "chn_name": chn_name,
                "eng_name": eng_name,
                "gender": gender,
                "birth_date": birth_date,
                "age": age,
                "email": email,
                "mobile_phone_no": phone,
                "chn_address": address,
                "postcode": postcode,
                "job": job,
                "company_name": company,
            }
        )

    yield pd.DataFrame(records, columns=columns)


# Minimal pydantic model used by the test below: constructing it with a value
# runs pydantic's `EmailStr` validation on that value.
class EmailCheckModel(BaseModel):
    # The single validated field.
    email: EmailStr


def test_email_generator(chn_personal_test_df: pd.DataFrame):
    """
    Check the fit / convert / reverse_convert round trip of `EmailGenerator`.

    The email column must be detected from the metadata, removed by `convert`,
    and restored by `reverse_convert` with values that validate as email
    addresses.
    """

    assert "email" in chn_personal_test_df.columns
    # get metadata
    metadata_df = Metadata.from_dataframe(chn_personal_test_df)

    # generator
    email_generator = EmailGenerator()
    assert not email_generator.fitted
    email_generator.fit(metadata_df)
    assert email_generator.fitted
    assert email_generator.email_columns_list == ["email"]

    # convert drops exactly the email columns and keeps every row
    converted_df = email_generator.convert(chn_personal_test_df)
    assert len(converted_df) == len(chn_personal_test_df)
    assert converted_df.shape[1] != chn_personal_test_df.shape[1]
    assert converted_df.shape[1] == chn_personal_test_df.shape[1] - len(
        email_generator.email_columns_list
    )
    assert "email" not in converted_df.columns

    # reverse_convert brings the email column back
    reverse_converted_df = email_generator.reverse_convert(converted_df)
    assert len(reverse_converted_df) == len(chn_personal_test_df)
    assert "email" in reverse_converted_df.columns
    # each generated value is email: validate the regenerated column, not the
    # fixture's original input column
    for each_value in reverse_converted_df["email"].values:
        assert EmailCheckModel(email=each_value)

0 comments on commit 14ad5e8

Please sign in to comment.