Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CHN address inspector #158

Merged
merged 5 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions sdgx/data_models/inspectors/personal.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,35 @@ def domain_verification(self, each_sample):
return True


class ChinaMainlandAddressInspector(RegexInspector):
    """Inspector for columns that hold China mainland addresses.

    The class attributes declare the regular expression, the PII flag, the
    reported data type name, the inspect level and the accepted length range.
    ``domain_verification`` adds per-sample length and company-name checks.
    """

    # Fuller reference expression kept for developers only; it is NOT in effect.
    # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$"

    # Active expression: anchored at the start, at least two CJK characters
    # followed by one of the listed administrative or street suffixes.
    pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)"

    pii = True

    data_type_name = "china_mainland_address"

    _inspect_level = 30

    # Inclusive bounds on the number of characters in an address sample.
    address_min_length = 8

    address_max_length = 30

    def domain_verification(self, each_sample):
        """Return True when ``each_sample`` passes the address sanity checks.

        A sample is rejected when its length falls outside the inclusive range
        ``address_min_length``..``address_max_length``, or when it ends with
        "公司", which distinguishes company names from addresses.
        """
        sample_length = len(each_sample)
        length_ok = self.address_min_length <= sample_length <= self.address_max_length
        # Short-circuit: the suffix is only examined once the length is valid.
        return length_ok and not each_sample.endswith("公司")


@hookimpl
def register(manager):
manager.register("EmailInspector", EmailInspector)
Expand All @@ -85,3 +114,5 @@ def register(manager):
manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode)

manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode)

manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector)
26 changes: 24 additions & 2 deletions tests/data_models/inspector/test_personal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@
import pytest
from faker import Faker

fake = Faker(locale="zh_CN")

from sdgx.data_models.inspectors.personal import (
ChinaMainlandAddressInspector,
ChinaMainlandIDInspector,
ChinaMainlandMobilePhoneInspector,
ChinaMainlandPostCode,
ChinaMainlandUnifiedSocialCreditCode,
EmailInspector,
)

fake = Faker(locale="zh_CN")


def generate_uniform_credit_code():
# generate china mainland 统一社会信用代码 for test
Expand Down Expand Up @@ -194,5 +195,26 @@ def test_chn_uscc_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
assert inspector_USCC.pii is True


# CHN address
def test_chn_address_inspector_demo_data(raw_data):
    """Fitting on the demo data is expected to report no address columns."""
    inspector = ChinaMainlandAddressInspector()
    inspector.fit(raw_data)

    assert not inspector.regex_columns
    detected_columns = inspector.inspect()["china_mainland_address_columns"]
    assert sorted(detected_columns) == []
    assert inspector.inspect_level == 30
    assert inspector.pii is True


def test_chn_address_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
    """Fitting on the generated frame is expected to report ``chn_address``."""
    inspector = ChinaMainlandAddressInspector()
    inspector.fit(chn_personal_test_df)

    # NOTE(review): no assertion on ``inspector.regex_columns`` is made here;
    # the original check was disabled — confirm whether it should be restored.
    detected_columns = inspector.inspect()["china_mainland_address_columns"]
    assert sorted(detected_columns) == ["chn_address"]
    assert inspector.inspect_level == 30
    assert inspector.pii is True


if __name__ == "__main__":
    # Run only this test module when the file is executed directly.
    pytest_args = ["-vv", "-s", __file__]
    pytest.main(pytest_args)
Loading