diff --git a/sdgx/data_models/inspectors/personal.py b/sdgx/data_models/inspectors/personal.py
index fb8d2454..3e925b5b 100644
--- a/sdgx/data_models/inspectors/personal.py
+++ b/sdgx/data_models/inspectors/personal.py
@@ -74,6 +74,49 @@ def domain_verification(self, each_sample):
         return True
 
 
+class ChinaMainlandAddressInspector(RegexInspector):
+    """Regex-based inspector for Chinese mainland postal addresses.
+
+    ``pattern`` requires a value to start with two or more CJK characters
+    followed by an administrative or street suffix (province, city, county,
+    village, lane, township, road, street, ...). ``domain_verification``
+    additionally bounds the sample length and rejects company names.
+    """
+
+    # Reference only: this more detailed pattern is commented out and is not used.
+    # pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市)|[\u4e00-\u9fa5]{2,}(市|区|县|自治州|自治县|县级市|地区|盟|林区)?|[\u4e00-\u9fa5]{0,}(街道|镇|乡)?|[\u4e00-\u9fa5]{0,}(路|街|巷|弄)?|[\u4e00-\u9fa5]{0,}(号|弄)?$"
+
+    pattern = r"^[\u4e00-\u9fa5]{2,}(省|自治区|特别行政区|市|县|村|弄|乡|路|街)"
+
+    pii = True
+
+    data_type_name = "china_mainland_address"
+
+    _inspect_level = 30
+
+    address_min_length = 8
+
+    address_max_length = 30
+
+    def domain_verification(self, each_sample):
+        """Return whether ``each_sample`` passes the address sanity checks.
+
+        Args:
+            each_sample: Candidate value; must support ``len()`` and
+                ``endswith()``, i.e. behave like a ``str``.
+
+        Returns:
+            False if the sample is shorter than ``address_min_length``, longer
+            than ``address_max_length``, or ends with "公司" (company); True
+            otherwise.
+        """
+        # Length must lie within [address_min_length, address_max_length] (8-30 by default).
+        if not (self.address_min_length <= len(each_sample) <= self.address_max_length):
+            return False
+        # Distinguish addresses from company names, which end with "公司".
+        return not each_sample.endswith("公司")
+
+
 @hookimpl
 def register(manager):
     manager.register("EmailInspector", EmailInspector)
@@ -85,3 +128,5 @@ def register(manager):
     manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode)
 
     manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode)
+
+    manager.register("ChinaMainlandAddressInspector", ChinaMainlandAddressInspector)
diff --git a/tests/data_models/inspector/test_personal.py b/tests/data_models/inspector/test_personal.py
index c7d27490..9f417ac9 100644
--- a/tests/data_models/inspector/test_personal.py
+++ b/tests/data_models/inspector/test_personal.py
@@ -7,6 +7,7 @@
 from faker import Faker
 
 from sdgx.data_models.inspectors.personal import (
+    ChinaMainlandAddressInspector,
     ChinaMainlandIDInspector,
     ChinaMainlandMobilePhoneInspector,
     ChinaMainlandPostCode,
@@ -194,5 +195,28 @@ def test_chn_uscc_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
     assert inspector_USCC.pii is True
 
 
+# CHN address
+def test_chn_address_inspector_demo_data(raw_data):
+    """No column of the ``raw_data`` fixture should be reported as an address column."""
+    inspector_CHN_Address = ChinaMainlandAddressInspector()
+    inspector_CHN_Address.fit(raw_data)
+    assert not inspector_CHN_Address.regex_columns
+    assert sorted(inspector_CHN_Address.inspect()["china_mainland_address_columns"]) == sorted([])
+    assert inspector_CHN_Address.inspect_level == 30
+    assert inspector_CHN_Address.pii is True
+
+
+def test_chn_address_inspector_generated_data(chn_personal_test_df: pd.DataFrame):
+    """Only the ``chn_address`` column of the generated frame should be reported."""
+    inspector_CHN_Address = ChinaMainlandAddressInspector()
+    inspector_CHN_Address.fit(chn_personal_test_df)
+    # assert inspector_CHN_Address.regex_columns
+    assert sorted(inspector_CHN_Address.inspect()["china_mainland_address_columns"]) == sorted(
+        ["chn_address"]
+    )
+    assert inspector_CHN_Address.inspect_level == 30
+    assert inspector_CHN_Address.pii is True
+
+
 if __name__ == "__main__":
     pytest.main(["-vv", "-s", __file__])