-
Notifications
You must be signed in to change notification settings - Fork 545
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature: Add Email Generator (a new type of sdgx.data_processor) (#184)
* add comment * Create email generator * Create testcase * complete email generator testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add annotations from __future__ * register into DataProcessorManager In the future, we need to introduce processor_level to control the loading order of data_processor instead of hard coding it. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add missing imports * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix faker localize typo * use email_validator for email generator testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix fit method in EmailGenerator * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
1ef4ca2
commit 14ad5e8
Showing
5 changed files
with
181 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
from faker import Faker | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.extension import hookimpl | ||
from sdgx.data_processors.generators.pii import PIIGenerator | ||
|
||
# Shared Faker instance (default locale) used by EmailGenerator.reverse_convert
# to produce the replacement email addresses.
fake = Faker()
|
||
|
||
class EmailGenerator(PIIGenerator):
    """
    Strip email columns from a pd.DataFrame and later refill them with fake addresses.

    ``convert`` removes every column named in ``email_columns_list`` from the
    input frame. ``reverse_convert`` attaches one freshly generated column of
    fake company email addresses per removed column name. The original email
    values are never restored; they are replaced by synthetic ones.

    Attributes:
        email_columns_list (list): Names of the columns that ``fit`` identified as email columns.
        fitted (bool): True once ``fit`` has been called.

    Methods:
        fit(metadata: Metadata | None = None): Reads the email column names from the metadata.
        convert(raw_data: pd.DataFrame) -> pd.DataFrame: Drops the email columns.
        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Attaches generated email columns.
    """

    # Class-level defaults. ``fit`` rebinds both attributes on the instance, so
    # the shared empty list is never mutated in place.
    email_columns_list: list = []

    fitted: bool = False

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Record which columns hold email addresses.

        Args:
            metadata: Metadata object queried with ``get("email_columns")``.
                When it is ``None`` (the declared default), or when the lookup
                yields ``None``, the generator is fitted with no email columns
                and ``convert`` / ``reverse_convert`` become pass-throughs.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        # The signature allows metadata=None, so guard the lookup instead of
        # failing with AttributeError on ``None.get``.
        email_columns = metadata.get("email_columns") if metadata is not None else None
        self.email_columns_list = [] if email_columns is None else list(email_columns)

        self.fitted = True

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Remove every known email column from ``raw_data``.

        Args:
            raw_data: Input table.

        Returns:
            ``raw_data`` itself when no email columns are known, otherwise the
            result of removing each email column via ``remove_columns``.
        """
        if not self.email_columns_list:
            return raw_data
        processed_data = raw_data
        # remove every email column from the dataframe
        for each_col in self.email_columns_list:
            processed_data = self.remove_columns(processed_data, each_col)
        return processed_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Attach a generated fake email column for every known email column name.

        Args:
            processed_data: Table without the email columns.

        Returns:
            ``processed_data`` itself when no email columns are known, otherwise
            the table with one generated column attached per email column name.
        """
        if not self.email_columns_list:
            return processed_data
        df_length = processed_data.shape[0]
        for each_col_name in self.email_columns_list:
            each_email_col = [fake.ascii_company_email() for _ in range(df_length)]
            # Reuse the target frame's index so the new column lines up row for
            # row even when the index is not a plain 0..n-1 range.
            # NOTE(review): assumes attach_columns joins on the index - confirm.
            each_email_df = pd.DataFrame(
                {each_col_name: each_email_col}, index=processed_data.index
            )
            processed_data = self.attach_columns(processed_data, each_email_df)

        return processed_data
|
||
|
||
@hookimpl
def register(manager):
    """
    Plugin hook that registers ``EmailGenerator`` with the given manager.

    Args:
        manager: Object exposing ``register(name, cls)``; presumably the
            data-processor manager that collects available processors.
    """
    manager.register("EmailGenerator", EmailGenerator)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
tests/data_processors/generators/test_email_generator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
from __future__ import annotations | ||
|
||
import datetime | ||
import random | ||
import re | ||
|
||
import pandas as pd | ||
import pytest | ||
from faker import Faker | ||
from pydantic import BaseModel, EmailStr | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.generators.email import EmailGenerator | ||
|
||
# Faker with the zh_CN locale: source of the Chinese-locale personal fields in the fixture.
fake = Faker(locale="zh_CN")
# Faker with the en_US locale: used only for the English name column.
fake_en = Faker(["en_US"])
|
||
|
||
@pytest.fixture
def chn_personal_test_df():
    """
    Yield a 1000-row DataFrame of fake personal records, including an ``email`` column.

    All values come from the module-level Faker instances and ``random``, so
    the content differs between runs; only the column layout is fixed.
    """
    columns = [
        "ssn_sfz",
        "chn_name",
        "eng_name",
        "gender",
        "birth_date",
        "age",
        "email",
        "mobile_phone_no",
        "chn_address",
        "postcode",
        "job",
        "company_name",
    ]
    current_year = datetime.datetime.today().year

    def _fake_person() -> list:
        # One record, with values listed in the same order as ``columns``.
        gender = random.choice(["male", "female"])
        given_name = fake.name_male if gender == "male" else fake.name_female
        chn_name = fake.last_name() + given_name()
        eng_name = fake_en.name()
        birth_date = fake.date()
        # The first four characters of the generated date string are the year.
        age = current_year - int(birth_date[:4])
        email = fake.email()
        phone = fake.phone_number()
        ssn = fake.ssn()
        address = fake.address()
        job = fake.job()
        company = fake.company()
        postcode = fake.postcode()
        return [
            ssn,
            chn_name,
            eng_name,
            gender,
            birth_date,
            age,
            email,
            phone,
            address,
            postcode,
            job,
            company,
        ]

    records = [_fake_person() for _ in range(1000)]

    yield pd.DataFrame(records, columns=columns)
|
||
|
||
# Minimal pydantic model used by the test as an email validator: constructing
# it raises a validation error when ``email`` is not a well-formed address.
class EmailCheckModel(BaseModel):
    email: EmailStr
|
||
|
||
def test_email_generator(chn_personal_test_df: pd.DataFrame):
    """
    Exercise EmailGenerator end to end: fit, convert, reverse_convert.

    Checks that fitting detects the ``email`` column, that ``convert`` drops
    exactly that column without changing the row count, and that
    ``reverse_convert`` restores an ``email`` column whose generated values
    are all valid email addresses.
    """
    assert "email" in chn_personal_test_df.columns
    # get metadata
    metadata_df = Metadata.from_dataframe(chn_personal_test_df)

    # generator
    email_generator = EmailGenerator()
    assert not email_generator.fitted
    email_generator.fit(metadata_df)
    assert email_generator.fitted
    assert email_generator.email_columns_list == ["email"]

    converted_df = email_generator.convert(chn_personal_test_df)
    assert len(converted_df) == len(chn_personal_test_df)
    assert converted_df.shape[1] != chn_personal_test_df.shape[1]
    assert converted_df.shape[1] == chn_personal_test_df.shape[1] - len(
        email_generator.email_columns_list
    )
    assert "email" not in converted_df.columns

    reverse_converted_df = email_generator.reverse_convert(converted_df)
    assert len(reverse_converted_df) == len(chn_personal_test_df)
    assert "email" in reverse_converted_df.columns
    # each generated value is email: validate the regenerated column, not the
    # fixture's original values, so the generator's output is actually checked
    for each_value in reverse_converted_df["email"].values:
        assert EmailCheckModel(email=each_value)