Skip to content

Commit

Permalink
Add CHN address inspector (#158)
Browse files Browse the repository at this point in the history
* add ChinaMainlandAddressInspector

* add test cases

* [pre-commit.ci] auto fixes from pre-commit.com hooks

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
MooooCat and pre-commit-ci[bot] committed Mar 11, 2024
1 parent ddd9756 commit 52741fd
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 2 deletions.
31 changes: 31 additions & 0 deletions sdgx/data_models/inspectors/personal.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,35 @@ def domain_verification(self, each_sample):
return True


class ChinaMainlandAddressInspector(RegexInspector):
    """Regex-based inspector for China mainland address columns.

    The class supplies a regular expression (``pattern``) together with a
    ``domain_verification`` hook that applies length and suffix checks.
    How the two are combined is decided by ``RegexInspector``, which is not
    visible in this section.
    """

    # Reference only, not used at runtime: a fuller expression kept for developers.
    # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$"

    # Active expression: at the start of the string, two or more characters in
    # the \u4e00-\u9fa5 range followed by one of the listed suffixes.
    pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)"

    data_type_name = "china_mainland_address"

    pii = True

    _inspect_level = 30

    # Inclusive bounds on the sample length accepted by domain_verification.
    address_min_length = 8
    address_max_length = 30

    def domain_verification(self, each_sample):
        """Apply the non-regex checks to a single sample.

        Args:
            each_sample: The value to check; it must support ``len()`` and,
                when its length is within bounds, ``endswith()``.

        Returns:
            bool: ``True`` when the length lies between ``address_min_length``
            and ``address_max_length`` (both inclusive) and the sample does
            not end with "公司"; ``False`` otherwise.
        """
        sample_length = len(each_sample)
        length_ok = self.address_min_length <= sample_length <= self.address_max_length
        # Company names can look like addresses, so a trailing "公司" is rejected.
        return length_ok and not each_sample.endswith("公司")


@hookimpl
def register(manager):
manager.register("EmailInspector", EmailInspector)
Expand All @@ -85,3 +114,5 @@ def register(manager):
manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode)

manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode)

manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector)
26 changes: 24 additions & 2 deletions tests/data_models/inspector/test_personal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@
import pytest
from faker import Faker

fake = Faker(locale="zh_CN")

from sdgx.data_models.inspectors.personal import (
ChinaMainlandAddressInspector,
ChinaMainlandIDInspector,
ChinaMainlandMobilePhoneInspector,
ChinaMainlandPostCode,
ChinaMainlandUnifiedSocialCreditCode,
EmailInspector,
)

fake = Faker(locale="zh_CN")


def generate_uniform_credit_code():
# generate china mainland 统一社会信用代码 for test
Expand Down Expand Up @@ -194,5 +195,26 @@ def test_chn_uscc_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
assert inspector_USCC.pii is True


# CHN address
def test_chn_address_inspector_demo_data(raw_data):
    """Fit the address inspector on ``raw_data`` and expect no address columns."""
    address_inspector = ChinaMainlandAddressInspector()
    address_inspector.fit(raw_data)

    # Nothing in this fixture is expected to be picked up by the regex.
    assert not address_inspector.regex_columns

    detected_columns = address_inspector.inspect()["china_mainland_address_columns"]
    assert sorted(detected_columns) == []

    assert address_inspector.inspect_level == 30
    assert address_inspector.pii is True


def test_chn_address_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
    """Fit the address inspector on the generated frame and expect ``chn_address``."""
    address_inspector = ChinaMainlandAddressInspector()
    address_inspector.fit(chn_personal_test_df)

    # Kept disabled, as in the original test (reason not visible here):
    # assert address_inspector.regex_columns

    detected_columns = address_inspector.inspect()["china_mainland_address_columns"]
    assert sorted(detected_columns) == ["chn_address"]

    assert address_inspector.inspect_level == 30
    assert address_inspector.pii is True


if __name__ == "__main__":
    # pytest.main() only returns the exit status; raise SystemExit with it so
    # that running this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main(["-vv", "-s", __file__]))

0 comments on commit 52741fd

Please sign in to comment.