diff --git a/UseCases.md b/UseCases.md new file mode 100644 index 0000000..9a67023 --- /dev/null +++ b/UseCases.md @@ -0,0 +1,106 @@ +This file describes the exact behavior of the methods for different edge cases and +explains the general logic. This description covers the behavior of get_tld, +get_tld_unsafe, get_sld, get_sld_unsafe, split_domain, split_domain_unsafe. + +Unsafe versions of the methods will significantly save resources on large-scale +applications of the library where the data has already been converted to +lowercase and missing data has a None value. This can be done in Spark/Dask, +for example, and result in a significant reduction in computational resources. +For ad hoc usage, the original functions are sufficient. + +1. General difference between the get_*() and get_*_unsafe() methods: +get_*_unsafe() does not check whether the input string is None and does not +transform it to lower case. + +2. The methods listed above work only with non-canonical FQDN strings - +the trailing dot must be removed before calling the method. This restriction allows +us to get rid of fuzzy logic in edge cases. + +3. DNS does not support empty labels - if some label is detected to be empty, +None will be returned. + +4. Every method processes the provided FQDN in reverse order, from the last +label towards the start of the string. It stops when the specific task is +completed. Therefore no validation occurs outside of this scope. For example, +``` +get_tld('......com') -> 'com' +``` +as leading dots are not processed. +The split_domain method is based on the get_sld method - it returns everything in +front of get_sld() as a prefix. 
+Specifically, for the example above: +``` +split_domain('......com') -> ('....',None,'com') +``` +Edge cases and expected behavior +The behavior of the library is best illustrated with small examples: +(boolean arguments are omitted if they do not affect the behavior) + +## get_tld() +### Degenerate case (empty list) + +| input | strict | wildcard | result | notes | +|--------|---------|----------|--------|-------| +| '' | | | None | empty labels are not allowed | +| '.' | | | None | empty labels are not allowed | +| '..' | | | None | empty labels are not allowed | +| '....' | | | None | empty labels are not allowed | +| 'abc' | false | | 'abc' | non-strict mode, the last label is TLD | +| 'abc' | true | | None | 'abc' not in the list | +| '.abc' | false | | 'abc' | non-strict mode, the last label is TLD | +| '.abc' | true | | None | 'abc' not in the list | +| 'abc.' | | | None | empty labels are not allowed | +| '....abc' | false | | 'abc' | non-strict mode, string head is not processed| +| '....abc' | true | | None | 'abc' not in the list | +| 'example.abc' | false | | 'abc' | non-strict mode, the last label is TLD | +| 'example.abc' | true | | None | 'abc' not in the list | + +### Simple case, no wildcards (['com']) + +| input | strict | wildcard | result | notes | +|--------|---------|----------|--------|-------| +| '' | | | None | empty labels are not allowed | +| '.' | | | None | empty labels are not allowed | +| '..' | | | None | empty labels are not allowed | +| '....' | | | None | empty labels are not allowed | +| 'abc' | false | | 'abc' | non-strict mode | +| 'abc' | true | | None | not in the list | +| 'com' | | | 'com' | allowed TLD | +| '.abc' | false | | 'abc' | non-strict mode | +| '.abc' | true | | None | not in the list | +| '.com' | | | 'com' | allowed TLD | +| 'abc.' 
| | | None | empty labels are not allowed | +| '....abc' | false | | 'abc' | non-strict mode, string head is not processed| +| '....abc' | true | | None | not in the list | +| '....com' | | | 'com' | allowed TLD, string head is not processed| +| 'example.abc' | false | | 'abc' | non-strict mode, the last label is TLD | +| 'example.abc' | true | | None | 'abc' not in the list | +| 'example.com' | | | 'com' | allowed TLD | + +### Simple case, negation, no wildcards (['com', '!org']) + +| input | strict | wildcard | result | notes | +|--------|---------|----------|--------|-------| +| '' | | | None | empty labels are not allowed | +| '.' | | | None | empty labels are not allowed | +| '..' | | | None | empty labels are not allowed | +| '....' | | | None | empty labels are not allowed | +| 'abc' | false | | 'abc' | non-strict mode | +| 'abc' | true | | None | not in the list | +| 'com' | | | 'com' | allowed TLD | +| 'org' | | | None | not allowed TLD | +| '.abc' | false | | 'abc' | non-strict mode | +| '.abc' | true | | None | not in the list | +| '.com' | | | 'com' | allowed TLD | +| '.org' | | | None | not allowed TLD | +| 'abc.' | | | None | empty labels are not allowed | +| 'com.' | | | None | empty labels are not allowed | +| 'org.' 
| | | None | empty labels are not allowed | +| '....abc' | false | | 'abc' | non-strict mode, string head is not processed| +| '....abc' | true | | None | not in the list | +| '....com' | | | 'com' | allowed TLD, string head is not processed| +| '....org' | | | None | not allowed TLD| +| 'example.abc' | false | | 'abc' | non-strict mode, the last label is TLD | +| 'example.abc' | true | | None | 'abc' not in the list | +| 'example.com' | | | 'com' | allowed TLD | +| 'example.org' | | | None | not allowed TLD | diff --git a/src/publicsuffix2/__init__.py b/src/publicsuffix2/__init__.py index 6c1d088..14dba6b 100644 --- a/src/publicsuffix2/__init__.py +++ b/src/publicsuffix2/__init__.py @@ -258,21 +258,44 @@ def get_sld(self, domain, wildcard=True, strict=False): :param strict: boolean, check the TLD is valid, return None if not :return: string, the SLD for the domain """ - if not domain: + if not domain or len(domain) == 0: return None + domain = domain.lower() + return self.get_sld_unsafe(domain, wildcard, strict) - # for compatibility, set strict True not to allow invalid TLDs - tld = self.get_tld(domain, wildcard, True) - if strict and tld is None: - return None + def get_sld_unsafe(self, domain, wildcard=True, strict=False): + """ + Return the second-level-domain (SLD) or private suffix of a given domain + according to the public suffix list. The public suffix list includes + wildcards, so if wildcard is set to True, this will follow the wildcard + on traversal, otherwise it will stop at wildcard nodes. + + The logic does not check by default whether the TLD is in the Trie, so + for example, 'www.this.local' will return 'this.local'. If you want to + ensure the TLD is in the public suffix list, use strict=True. + + If domain is already an eTLD, it returns domain as-is instead of None + value. + + In difference from get_sld method this method does not perform validation + of the input string, transformation it to the lowercase or trimming of + taildot. 
- parts = domain.lower().strip('.').split('.') - num_of_tld_parts = 0 if tld is None else tld.count('.') + 1 + :param domain: string, needs to match the encoding of the PSL (idna or UTF8) + :param wildcard: boolean, follow wildcard patterns + :param strict: boolean, check the TLD is valid, return None if not + :return: string, the SLD for the domain - if len(parts) <= num_of_tld_parts: + """ + tld = self.get_tld_unsafe(domain, wildcard, strict) + if tld is None: + return None + rest = len(domain) - len(tld) + if rest == 0: return tld else: - return '.'.join(parts[-(num_of_tld_parts + 1):]) + sld_idx = domain.rfind('.', 0, rest - 1) + 1 # will be rest + 1 iff empty label + return domain[sld_idx:] if rest - sld_idx > 1 else None def get_public_suffix(self, domain, wildcard=True, strict=False): """ @@ -297,31 +320,208 @@ def get_tld(self, domain, wildcard=True, strict=False): :param strict: boolean, check that top TLD is valid in Trie :return: string, the TLD for the domain """ - if not domain: - return None - parts = domain.lower().strip('.').split('.') - hits = [None] * len(parts) - if strict and ( - self.root in (0, 1) or parts[-1] not in self.root[1].keys() - ): + if domain is None: return None + domain = domain.lower() + + return self.get_tld_unsafe(domain, wildcard, strict) + + def get_tld_unsafe(self, domain, wildcard=True, strict=False): + """ + Return the TLD, or public suffix, of a domain using the public suffix + list. uses wildcards if set, and checks for valid top TLD is + strict=True. The input domain should not have root '.' in the end. + + This will return the domain itself when it is an ICANN TLD, e.g., 'com' + returns 'com', for follow on processing, while 'co.uk' return 'uk'. On + the other hand, more complicated domains will return their public + suffix, e.g., 'google.co.uk' will return 'co.uk'. + + Empty labels are not allowed: + + '.' -> . -> None - self._lookup_node(hits, 1, self.root, parts, wildcard) + 'com.' -> . 
-> None - for i, what in enumerate(hits): - if what is not None and what == 0: - return '.'.join(parts[i:]) + '.com' -> . -> 'com' + + In difference from get_sld method this method does not perform validation + of the input string, transformation it to the lowercase or trimming of + taildot. + + :param domain: string the domain which TLD should be matched, without trailing '.' + :param wildcard: boolean, follow wildcards in Trie + :param strict: boolean, check that top TLD is valid in Trie + :return: string, the TLD for the domain + """ + lbl_end = -1 + lbl_start = len(domain) + + tld_start = None + root = self.root + if root == 0: # exhausted case - empty root. Use last label as TLD if not strict + lbl_start = None if strict else domain.rfind('.') + + while type(root) is tuple: + if root[0] == 0: + tld_start = lbl_start + if lbl_start == -1: + break + lbl_end = lbl_start + lbl_start = domain.rfind('.', 0, lbl_end) + if len(domain[lbl_start + 1:lbl_end]) == 0: + break + p1 = root[1].get(domain[lbl_start + 1:lbl_end]) + root = root[1].get("*") if p1 is None and wildcard else p1 + + if root == 0: + tld_start = lbl_start + # elif root == 1: # we already have tld_start point to previous label + # tld_start = lbl_end + elif root is None: # only last label + if strict or tld_start is not None and tld_start != len(domain): + tld_start = lbl_end + else: + tld_start = lbl_start + + tld = domain[tld_start + 1:] if tld_start is not None else None + return tld or None # empty string -> None + + def get_components(self, domain: str, wildcard=True, strict=False) -> (str, str, str): + """ + Returns 3-tuple of components of the domain name: (prefix, SLL, TLD/eTLD) + where + * TLD/ETLD is top-level domain (extended top-level domain) per publicsuffix + * SLL - second level (registrable domain) label (the label on immediately + on the left of TLD/eTLD. None if only TLD is present. + * prefix - all the labels on the left side of TLD/eTLD. 
None if not present + This method does not validate the conformity of prefix to DNS requirements. + Note: this function as well as the appropriate method of PublicSuffixList + class is crafted for use in bulk-processors (such as pandas), therefore it + always returns 3-tuple: + + + psl = ps2.PublicSuffixList(idna=True) + df['prefix']['sll']['tld'] = zip(*df.domain.apply(ps2.get_component) + df = df.dropna(subset=['prefix','sll','tld']) + + + Examples of domain decomposition: + '.' -> (None, None, None) + 'com.' -> (None, None, None) + 'com' -> (None, None, 'com') + 'google.com' -> (None, 'google', 'com') + 'www.google.com' -> ('www', 'google', 'com') + 'mail.l.google.com' -> ('mail.l', 'google', 'com') + 'mail.l..com' -> ('mail.l', 'None', 'com') - invalid case - empty label + '.......com' -> ('.....', 'None', 'com') - invalid case - empty labels + + Optionally read, and parse a public suffix list. `psl_file` is either a file + location string, or a file-like object, or an iterable of lines from a + public suffix data file. + + If psl_file is None, the vendored file named "public_suffix_list.dat" is + loaded. It is stored side by side with this Python package. + + The file format is described at http://publicsuffix.org/ + + :param domain: string - domain name without trailing '.' + :param wildcard: bool - whether wildcard rules are supported + :param strict: bool - disable unknown TLDs + :return: 3-tuple, (prefix, SLL, TLD/eTLD) + """ + if not domain or len(domain) == 0: + return None, None, None + domain = domain.lower() + return self.get_components_unsafe(domain, wildcard, strict) + + def get_components_unsafe(self, domain: str, wildcard=True, strict=False) -> (str, str, str): + """ + This is unsafe method that does not checks if the domain is None. Also it does + not perform conversion of the domain into lowercase. 
+ """ + + tld = self.get_tld_unsafe(domain, wildcard, strict) + sld = None + prefix = None + + if tld is not None: + sld_end_idx = len(domain) - len(tld) - 1 + if sld_end_idx > 0: + idx = domain.rfind('.', 0, sld_end_idx) + prefix = domain[:idx] if idx > 0 else None + sld = domain[idx + 1:sld_end_idx] + sld = None if len(sld) == 0 else sld + return prefix, sld, tld _PSL = None +def get_components(domain, psl_file=None, wildcard=True, idna=True, strict=False): + """ + Returns 3-tuple of components of the domain name: (prefix, SLL, TLD/eTLD) + where + * TLD/ETLD is top-level domain (extended top-level domain) per publicsuffix + * SLL - second level (registrable domain) label (the label on immediately + on the left of TLD/eTLD. None if only TLD is present. + * prefix - all the labels on the left side of TLD/eTLD. None if not present + This method does not validate the conformity of prefix to DNS requirements. + Note: this function as well as the appropriate method of PublicSuffixList + class is crafted for use in bulk-processors (such as pandas), therefore it + always returns 3-tuple: + + ``` + df['prefix']['sll']['tld'] = zip(*df.domain.apply(get_component, idna=True) + df = df.dropna(subset=['prefix','sll','tld']) + ``` + + Examples of domain decomposition: + '.' -> (None, None, None) + 'com.' -> (None, None, None) + 'com' -> (None, None, 'com') + 'google.com' -> (None, 'google', 'com') + 'www.google.com' -> ('www', 'google', 'com') + 'mail.l.google.com' -> ('mail.l', 'google', 'com') + 'mail.l..com' -> ('mail.l', 'None', 'com') - invalid case - empty label + '.......com' -> ('.....', 'None', 'com') - invalid case - empty labels + + Optionally read, and parse a public suffix list. `psl_file` is either a file + location string, or a file-like object, or an iterable of lines from a + public suffix data file. + + If psl_file is None, the vendored file named "public_suffix_list.dat" is + loaded. It is stored side by side with this Python package. 
+ + The file format is described at http://publicsuffix.org/ + + Convenience function that builds and caches a PublicSuffixList object. + NOTE: this function caches the first set of parameters thar were used. If we + have two subsequent calls: + ``` + split_domain(domain, idna=False) + split_domain(domain, idna=True) + ``` + the second call will use the same non-idna publicsuffix as the first one. + Use with caution. + + :param psl_file: the file name, if not available built in is used + :param idna: only idna part of the public suffix is used + :param domain: string - domain name without trailing '.' + :param wildcard: bool - whether wildcard rules are supported + :param strict: bool - disable unknown TLDs + :return: 3-tuple, (prefix, SLL, TLD/eTLD) + """ + global _PSL + _PSL = _PSL or PublicSuffixList(psl_file, idna=idna) + return _PSL.get_components(domain, wildcard=wildcard, strict=strict) + + def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False): """ Return the private suffix or SLD for a `domain` DNS name string. The original publicsuffix2 library used the method get_public_suffix() for this - purpose, but get_private_suffix() is more proper. Convenience function that - builds and caches a PublicSuffixList object. + purpose, but get_private_suffix() is more proper. Optionally read, and parse a public suffix list. `psl_file` is either a file location string, or a file-like object, or an iterable of lines from a @@ -331,6 +531,24 @@ def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False): loaded. It is stored side by side with this Python package. The file format is described at http://publicsuffix.org/ + + Convenience function that builds and caches a PublicSuffixList object. + NOTE: this function caches the first set of parameters thar were used. 
If we + have two subsequent calls: + ``` + get_sld(domain, idna=False) + get_sld(domain, idna=True) + ``` + the second call will use the same non-idna publicsuffix as the first one. + Use with caution. + + :param psl_file: the file name, if not available built in is used + :param idna: only idna part of the public suffix is used + :param domain: string - domain name without trailing '.' + :param wildcard: bool - whether wildcard rules are supported + :param strict: bool - disable unknown TLDs + :return: second-level (registrable) domain that is TLD/eTLD + one label on the left + if only TLD is found it is returned. None if empty label or invalid input """ global _PSL _PSL = _PSL or PublicSuffixList(psl_file, idna=idna) @@ -340,8 +558,7 @@ def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False): def get_tld(domain, psl_file=None, wildcard=True, idna=True, strict=False): """ Return the TLD or public suffix for a `domain` DNS name string. (this is - actually the private suffix that is returned) Convenience function that - builds and caches a PublicSuffixList object. + actually the private suffix that is returned) Optionally read, and parse a public suffix list. `psl_file` is either a file location string, or a file-like object, or an iterable of lines from a @@ -351,6 +568,23 @@ def get_tld(domain, psl_file=None, wildcard=True, idna=True, strict=False): loaded. It is stored side by side with this Python package. The file format is described at http://publicsuffix.org/ + + Convenience function that builds and caches a PublicSuffixList object. + NOTE: this function caches the first set of parameters thar were used. If we + have two subsequent calls: + ``` + get_tld(domain, idna=False) + get_tld(domain, idna=True) + ``` + the second call will use the same non-idna publicsuffix as the first one. + Use with caution. 
+ + :param psl_file: the file name, if not available built in is used + :param idna: only idna part of the public suffix is used + :param domain: string - domain name without trailing '.' + :param wildcard: bool - whether wildcard rules are supported + :param strict: bool - disable unknown TLDs + :return: TLD/eTLD or None """ global _PSL _PSL = _PSL or PublicSuffixList(psl_file, idna=idna) @@ -361,8 +595,7 @@ def get_public_suffix(domain, psl_file=None, wildcard=True, idna=True, strict=Fa """ Included for compatibility with the original publicsuffix2 library -- this function returns the private suffix or SLD of the domain. To get the public - suffix, use get_tld(). Convenience function that builds and caches a - PublicSuffixList object. + suffix, use get_tld(). Optionally read, and parse a public suffix list. `psl_file` is either a file location string, or a file-like object, or an iterable of lines from a diff --git a/tests.py b/tests.py index 39fc60d..f65b25a 100644 --- a/tests.py +++ b/tests.py @@ -49,8 +49,11 @@ def test_get_sld_from_empty_list(self): psl = publicsuffix.PublicSuffixList([]) assert 'com' == psl.get_sld('com') assert 'com' == psl.get_sld('COM') - assert 'com' == psl.get_sld('.com') - assert 'com' == psl.get_sld('a.example.com') + # '.com' -> . -> None, empty labels are not allowed + assert None == psl.get_sld('.com') + # 'a.example.com', strict=False -> .. -> 'example.com' + assert 'example.com' == psl.get_sld('a.example.com') + def test_get_sld_from_empty_list_in_strict_mode(self): psl = publicsuffix.PublicSuffixList([]) @@ -78,7 +81,8 @@ def test_get_sld_from_list_with_exception_rule(self): def test_get_sld_from_list_with_fqdn(self): psl = publicsuffix.PublicSuffixList(['com']) - assert 'example.com' == psl.get_sld('example.com.') + # 'example.com.' -> .. 
-> None, empty labels are not allowed + assert None == psl.get_sld('example.com.') def test_get_sld_from_list_with_unicode(self): psl = publicsuffix.PublicSuffixList([u'\u0440\u0444'], idna=False) @@ -103,24 +107,29 @@ def test_get_sld_from_builtin_full_publicsuffix_org_list_with_mixed_case(self): def test_get_sld_from_builtin_full_publicsuffix_org_list_with_leading_dot(self): psl = publicsuffix.PublicSuffixList(None) - assert 'com' == psl.get_sld('.com') - assert 'example' == psl.get_sld('.example') + # '.com' -> . -> None, empty labels are not allowed + assert None == psl.get_sld('.com') + # '.example' -> . -> None, empty labels are not allowed + assert None == psl.get_sld('.example') assert 'example.com' == psl.get_sld('.example.com') - assert 'example' == psl.get_sld('.example.example') + # note: non-strict mode: TLD 'example' -> SLD example.example + assert 'example.example' == psl.get_sld('.example.example') def test_get_sld_from_builtin_full_publicsuffix_org_list_with_unlisted_tld(self): psl = publicsuffix.PublicSuffixList(None) assert 'example' == psl.get_sld('example') - assert 'example' == psl.get_sld('example.example') - assert 'example' == psl.get_sld('b.example.example') - assert 'example' == psl.get_sld('a.b.example.example') + # non-strict mode, tld=example, sld=example.example + assert 'example.example' == psl.get_sld('example.example') + assert 'example.example' == psl.get_sld('b.example.example') + assert 'example.example' == psl.get_sld('a.b.example.example') def test_get_sld_from_builtin_full_publicsuffix_org_list_with_listed_ut_non_internet_tld(self): psl = publicsuffix.PublicSuffixList(None) assert 'local' == psl.get_sld('local') - assert 'local' == psl.get_sld('example.local') - assert 'local' == psl.get_sld('b.example.local') - assert 'local' == psl.get_sld('a.b.example.local') + # note: non-strict mode, unknown tld: local -> sld: example.local + assert 'example.local' == psl.get_sld('example.local') + assert 'example.local' == 
psl.get_sld('b.example.local') + assert 'example.local' == psl.get_sld('a.b.example.local') def test_get_sld_from_builtin_full_publicsuffix_org_list_with_one_rule(self): psl = publicsuffix.PublicSuffixList(None) @@ -237,16 +246,19 @@ def test_get_tld_returns_correct_tld_or_etld(self): assert 'co.uk' == psl.get_tld('co.uk', wildcard=True) assert 'co.uk' == psl.get_tld('co.uk', wildcard=False) assert None == psl.get_tld('blah.local', strict=True) - assert None == psl.get_tld('blah.local', wildcard=False) + # non-strict mode: TLD = 'local' + assert 'local' == psl.get_tld('blah.local', wildcard=False) assert 'local' == psl.get_tld('blah.local') def test_get_tld_returns_correct_tld_or_etld_for_fqdn(self): psl = publicsuffix.PublicSuffixList() - assert 'com' == psl.get_tld('www.foo.com.') + # note: empty label or dot on the right side is not allowed + assert None == psl.get_tld('www.foo.com.') def test_get_tld_returns_correct_tld_or_etld_for_root_domain(self): psl = publicsuffix.PublicSuffixList() - assert '' == psl.get_tld('.') + # Note: empty label is not allowed + assert None == psl.get_tld('.') def test_get_tld_returns_correct_tld_or_etld_for_empty_string(self): psl = publicsuffix.PublicSuffixList() @@ -270,11 +282,12 @@ def test_get_sld_backward_compatibility(self): def test_get_sld_backward_compatibility_strict_and_wildcard_flags(self): psl = publicsuffix.PublicSuffixList() assert 'local' == psl.get_sld('local') - assert 'local' == psl.get_sld('foo.local') + # non-strict mode: TLD=local -> sld = foo.local + assert 'foo.local' == psl.get_sld('foo.local') assert None == psl.get_sld('local', strict=True) assert None == psl.get_sld('foo.local', strict=True) assert 'local' == psl.get_sld('local', wildcard=False) - assert 'local' == psl.get_sld('foo.local', strict=False) + assert 'foo.local' == psl.get_sld('foo.local', strict=False) def test_get_sld_backward_compatibility_sld_for_empty_string(self): psl = publicsuffix.PublicSuffixList() @@ -284,13 +297,15 @@ def 
test_get_sld_backward_compatibility_sld_for_empty_string(self): def test_get_sld_backward_compatibility_sld_for_fqdn(self): psl = publicsuffix.PublicSuffixList() - assert 'foo.com' == psl.get_sld('www.foo.com.') + # 'www.foo.com.' -> ... -> None, empty labels are not allowed + assert None == psl.get_sld('www.foo.com.') def test_get_sld_backward_compatibility_sld_for_root_domain(self): psl = publicsuffix.PublicSuffixList() - assert '' == psl.get_sld('.') + # empty labels are not allowed + assert None == psl.get_sld('.') assert None == psl.get_sld('.', strict=True) - assert '' == psl.get_sld('.', wildcard=False) + assert None == psl.get_sld('.', wildcard=False) if __name__ == '__main__': diff --git a/tests_mozilla.py b/tests_mozilla.py index 351e9f5..216116e 100644 --- a/tests_mozilla.py +++ b/tests_mozilla.py @@ -583,40 +583,49 @@ def test_get_sld_Mixed_case3(self): assert 'example.com' == publicsuffix.get_sld('WwW.example.COM') def test_get_sld_Leading_dot1(self): - assert 'com' == publicsuffix.get_sld('.com') + # '.com' -> . -> None, empty labels are not allowed + assert None == publicsuffix.get_sld('.com') def test_get_sld_Leading_dot2(self): - assert 'example' == publicsuffix.get_sld('.example') + # '.example' -> . 
-> None, empty labels are not allowed + assert None == publicsuffix.get_sld('.example') def test_get_sld_Leading_dot3(self): assert 'example.com' == publicsuffix.get_sld('.example.com') def test_get_sld_Leading_dot4(self): - assert 'example' == publicsuffix.get_sld('.example.example') + # non-strict: TLD:example, SLD:example.example + assert 'example.example' == publicsuffix.get_sld('.example.example') def test_get_sld_Unlisted_sld1(self): assert 'example' == publicsuffix.get_sld('example') def test_get_sld_Unlisted_sld2(self): - assert 'example' == publicsuffix.get_sld('example.example') + # non-strict: TLD:example, SLD:example.example + assert 'example.example' == publicsuffix.get_sld('example.example') def test_get_sld_Unlisted_sld3(self): - assert 'example' == publicsuffix.get_sld('b.example.example') + # non-strict: TLD:example, SLD:example.example + assert 'example.example' == publicsuffix.get_sld('b.example.example') def test_get_sld_Unlisted_sld4(self): - assert 'example' == publicsuffix.get_sld('a.b.example.example') + # non-strict mode: TLD=example -> SLD=example.example + assert 'example.example' == publicsuffix.get_sld('a.b.example.example') def test_get_sld_Listed_but_non_Internet_sld1(self): assert 'local' == publicsuffix.get_sld('local') def test_get_sld_Listed_but_non_Internet_sld2(self): - assert 'local' == publicsuffix.get_sld('example.local') + # non-strict: TLD:local, SLD:example.local + assert 'example.local' == publicsuffix.get_sld('example.local') def test_get_sld_Listed_but_non_Internet_sld3(self): - assert 'local' == publicsuffix.get_sld('b.example.local') + # non-strict: TLD:local, SLD:example.local + assert 'example.local' == publicsuffix.get_sld('b.example.local') def test_get_sld_Listed_but_non_Internet_sld4(self): - assert 'local' == publicsuffix.get_sld('a.b.example.local') + # non-strict: TLD:local, SLD:example.local + assert 'example.local' == publicsuffix.get_sld('a.b.example.local') def test_get_sld_tld_with_only_1_rule1(self): 
assert 'biz' == publicsuffix.get_sld('biz') diff --git a/tests_scenarios.py b/tests_scenarios.py new file mode 100644 index 0000000..5cc0676 --- /dev/null +++ b/tests_scenarios.py @@ -0,0 +1,744 @@ +import unittest +import publicsuffix2 as ps2 + + +class TestPSScenarios(unittest.TestCase): + + def setUp(self): + self.psl = None + + def validate_tld_empty_label(self): + self.assertIsNone(self.psl.get_tld(None)) + self.assertIsNone(self.psl.get_tld('')) + self.assertIsNone(self.psl.get_tld('', strict=True)) + self.assertIsNone(self.psl.get_tld('', wildcard=False)) + self.assertIsNone(self.psl.get_tld('', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_tld('.')) + self.assertIsNone(self.psl.get_tld('.', strict=True)) + self.assertIsNone(self.psl.get_tld('.', wildcard=False)) + self.assertIsNone(self.psl.get_tld('.', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_tld('..')) + self.assertIsNone(self.psl.get_tld('..', strict=True)) + self.assertIsNone(self.psl.get_tld('..', wildcard=False)) + self.assertIsNone(self.psl.get_tld('..', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_tld('....')) + self.assertIsNone(self.psl.get_tld('....', strict=True)) + self.assertIsNone(self.psl.get_tld('....', wildcard=False)) + self.assertIsNone(self.psl.get_tld('....', strict=True, wildcard=False)) + + def validate_sld_empty_label(self): + self.assertIsNone(self.psl.get_sld(None)) + self.assertIsNone(self.psl.get_sld('')) + self.assertIsNone(self.psl.get_sld('', strict=True)) + self.assertIsNone(self.psl.get_sld('', wildcard=False)) + self.assertIsNone(self.psl.get_sld('', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_sld('.')) + self.assertIsNone(self.psl.get_sld('.', strict=True)) + self.assertIsNone(self.psl.get_sld('.', wildcard=False)) + self.assertIsNone(self.psl.get_sld('.', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_sld('..')) + self.assertIsNone(self.psl.get_sld('..', strict=True)) + 
self.assertIsNone(self.psl.get_sld('..', wildcard=False)) + self.assertIsNone(self.psl.get_sld('..', strict=True, wildcard=False)) + self.assertIsNone(self.psl.get_sld('....')) + self.assertIsNone(self.psl.get_sld('....', strict=True)) + self.assertIsNone(self.psl.get_sld('....', wildcard=False)) + self.assertIsNone(self.psl.get_sld('....', strict=True, wildcard=False)) + + def validate_components_empty_label(self): + self.assertEqual((None, None, None), self.psl.get_components(None)) + self.assertEqual((None, None, None), self.psl.get_components('')) + self.assertEqual((None, None, None), self.psl.get_components('', strict=True)) + self.assertEqual((None, None, None), self.psl.get_components('', wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('', strict=True, wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('.')) + self.assertEqual((None, None, None), self.psl.get_components('.', strict=True)) + self.assertEqual((None, None, None), self.psl.get_components('.', wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('.', strict=True, wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('..')) + self.assertEqual((None, None, None), self.psl.get_components('..', strict=True)) + self.assertEqual((None, None, None), self.psl.get_components('..', wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('..', strict=True, wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('....')) + self.assertEqual((None, None, None), self.psl.get_components('....', strict=True)) + self.assertEqual((None, None, None), self.psl.get_components('....', wildcard=False)) + self.assertEqual((None, None, None), self.psl.get_components('....', strict=True, wildcard=False)) + + def validate_tld(self, tld: str, expected: tuple): + # tld-only case + self.assertEqual(expected[0], self.psl.get_tld(tld)) + self.assertEqual(expected[1], 
self.psl.get_tld(tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld(tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld(tld, strict=True, wildcard=False)) + + # empty SLD label + self.assertEqual(expected[0], self.psl.get_tld('.' + tld)) + self.assertEqual(expected[1], self.psl.get_tld('.' + tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld('.' + tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld('.' + tld, strict=True, wildcard=False)) + + # empty SLD label, multiple dots + self.assertEqual(expected[0], self.psl.get_tld('....' + tld)) + self.assertEqual(expected[1], self.psl.get_tld('....' + tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld('....' + tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld('....' + tld, strict=True, wildcard=False)) + + # . case + self.assertEqual(expected[0], self.psl.get_tld('example.' + tld)) + self.assertEqual(expected[1], self.psl.get_tld('example.' + tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld('example.' + tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld('example.' + tld, strict=True, wildcard=False)) + + # .. case -> empty SLL + prefix + self.assertEqual(expected[0], self.psl.get_tld('example..' + tld)) + self.assertEqual(expected[1], self.psl.get_tld('example..' + tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld('example..' + tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld('example..' + tld, strict=True, wildcard=False)) + + # .. case + self.assertEqual(expected[0], self.psl.get_tld('www.example.' + tld)) + self.assertEqual(expected[1], self.psl.get_tld('www.example.' + tld, strict=True)) + self.assertEqual(expected[2], self.psl.get_tld('www.example.' + tld, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_tld('www.example.' 
+ tld, strict=True, wildcard=False)) + + def validate_sld(self, domain: str, expected: tuple): + self.assertEqual(expected[0], self.psl.get_sld(domain)) + self.assertEqual(expected[1], self.psl.get_sld(domain, strict=True)) + self.assertEqual(expected[2], self.psl.get_sld(domain, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_sld(domain, strict=True, wildcard=False)) + + def validate_components(self, domain: str, expected: tuple): + self.assertEqual(expected[0], self.psl.get_components(domain)) + self.assertEqual(expected[1], self.psl.get_components(domain, strict=True)) + self.assertEqual(expected[2], self.psl.get_components(domain, wildcard=False)) + self.assertEqual(expected[3], self.psl.get_components(domain, strict=True, wildcard=False)) + + def test_01a_unknown_tld(self): + self.psl = ps2.PublicSuffixList([]) + + self.validate_tld_empty_label() + + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='aBc', expected=('abc', None, 'abc', None)) + + def test_01b_unknown_tld(self): + self.psl = ps2.PublicSuffixList([]) + + self.validate_sld_empty_label() + + self.validate_sld(domain='abc', expected=('abc', None, 'abc', None)) + self.validate_sld(domain='.abc', expected=(None, None, None, None)) + self.validate_sld(domain='abc.', expected=(None, None, None, None)) + self.validate_sld(domain='example..abc', expected=(None, None, None, None)) + self.validate_sld(domain='example.abc', expected=('example.abc', None, 'example.abc', None)) + self.validate_sld(domain='.exAMPle.abc', expected=('example.abc', None, 'example.abc', None)) + self.validate_sld(domain='www.example.abc', expected=('example.abc', None, 'example.abc', None)) + self.validate_sld(domain='www.en.example.abc', expected=('example.abc', None, 'example.abc', None)) + + def test_01c_unknown_tld(self): + self.psl = ps2.PublicSuffixList([]) + + self.validate_components_empty_label() + + self.validate_components(domain='abc', + expected=((None, None, 
'abc'), (None, None, None), + (None, None, 'abc'), (None, None, None))) + self.validate_components(domain='.abc', + expected=((None, None, 'abc'), (None, None, None), + (None, None, 'abc'), (None, None, None))) + self.validate_components(domain='abc.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.abc', + expected=((None, 'example', 'abc'), (None, None, None), + (None, 'example', 'abc'), (None, None, None))) + self.validate_components(domain='example..abc', + expected=(('example', None, 'abc'), (None, None, None), + ('example', None, 'abc'), (None, None, None))) + self.validate_components(domain='.example.abc', + expected=((None, 'example', 'abc'), (None, None, None), + (None, 'example', 'abc'), (None, None, None))) + self.validate_components(domain='.example.abc.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.abc', + expected=(('www', 'example', 'abc'), (None, None, None), + ('www', 'example', 'abc'), (None, None, None))) + self.validate_components(domain='www.en.example.abc', + expected=(('www.en', 'example', 'abc'), (None, None, None), + ('www.en', 'example', 'abc'), (None, None, None))) + + def test_02a_known_tld(self): + self.psl = ps2.PublicSuffixList(['com']) + + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='co.uk', expected=('uk', None, 'uk', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='cOm', expected=('com', 'com', 'com', 'com')) + + def test_02b_known_tld(self): + self.psl = ps2.PublicSuffixList(['com']) + + self.validate_sld(domain='com', expected=('com', 'com', 'com', 'com')) + self.validate_sld(domain='.com', expected=(None, None, None, None)) + self.validate_sld(domain='com.', expected=(None, None, None, None)) + self.validate_sld(domain='example..com', 
expected=(None, None, None, None)) + self.validate_sld(domain='example.com.', expected=(None, None, None, None)) + self.validate_sld(domain='example.com', + expected=('example.com', 'example.com', 'example.com', 'example.com')) + self.validate_sld(domain='.exAMPle.com', + expected=('example.com', 'example.com', 'example.com', 'example.com')) + self.validate_sld(domain='www.example.com', + expected=('example.com', 'example.com', 'example.com', 'example.com')) + self.validate_sld(domain='www.en.example.com', + expected=('example.com', 'example.com', 'example.com', 'example.com')) + + def test_02c_known_tld(self): + self.psl = ps2.PublicSuffixList(['com']) + + self.validate_components(domain='com', + expected=((None, None, 'com'), (None, None, 'com'), + (None, None, 'com'), (None, None, 'com'))) + self.validate_components(domain='.com', + expected=((None, None, 'com'), (None, None, 'com'), + (None, None, 'com'), (None, None, 'com'))) + self.validate_components(domain='com.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.com', + expected=((None, 'example', 'com'), (None, 'example', 'com'), + (None, 'example', 'com'), (None, 'example', 'com'))) + self.validate_components(domain='example..com', + expected=(('example', None, 'com'), ('example', None, 'com'), + ('example', None, 'com'), ('example', None, 'com'))) + self.validate_components(domain='.example.com', + expected=((None, 'example', 'com'), (None, 'example', 'com'), + (None, 'example', 'com'), (None, 'example', 'com'))) + self.validate_components(domain='.example.com.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.com', + expected=(('www', 'example', 'com'), ('www', 'example', 'com'), + ('www', 'example', 'com'), ('www', 'example', 'com'))) + self.validate_components(domain='www.en.example.com', + expected=(('www.en', 'example', 
'com'), ('www.en', 'example', 'com'), + ('www.en', 'example', 'com'), ('www.en', 'example', 'com'))) + + def test_03a_negated_tld(self): + self.psl = ps2.PublicSuffixList(['com', '!org']) + + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='org', expected=(None, None, None, None)) + self.validate_tld(tld='oRg', expected=(None, None, None, None)) + + def test_03b_negated_tld(self): + self.psl = ps2.PublicSuffixList(['com', '!org']) + + # validate the behavior with negated TLD + self.validate_sld(domain='org', expected=(None, None, None, None)) + self.validate_sld(domain='.org', expected=(None, None, None, None)) + self.validate_sld(domain='org.', expected=(None, None, None, None)) + self.validate_sld(domain='example..org', expected=(None, None, None, None)) + self.validate_sld(domain='example.org.', expected=(None, None, None, None)) + self.validate_sld(domain='example.org', expected=(None, None, None, None)) + self.validate_sld(domain='.exAMPle.org', expected=(None, None, None, None)) + self.validate_sld(domain='www.example.org', expected=(None, None, None, None)) + self.validate_sld(domain='www.en.example.org', expected=(None, None, None, None)) + + def test_03c_negated_tld(self): + self.psl = ps2.PublicSuffixList(['!org']) + + self.validate_components(domain='org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='.org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='org.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example..org', + expected=((None, None, None), 
(None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='.example.org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='.example.org.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.en.example.org', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + + def test_04a_known_etld(self): + self.psl = ps2.PublicSuffixList(['com', 'co.uk']) + + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_tld(tld='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + + def test_04b_known_etld(self): + self.psl = ps2.PublicSuffixList(['co.uk']) + + # TLD of eTLD is automatically registered + self.validate_sld(domain='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_sld(domain='.uk', expected=(None, None, None, None)) + self.validate_sld(domain='uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..uk', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='.exAMPle.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='www.example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='www.en.example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + + # validate ETLD + 
self.validate_sld(domain='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='.co.uk', expected=(None, None, None, None)) + self.validate_sld(domain='co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..co.uk', expected=(None, None, None, None)) + self.validate_sld(domain='example.co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'example.co.uk', 'example.co.uk')) + self.validate_sld(domain='.exAMPle.co.uk', + expected=('example.co.uk', 'example.co.uk', 'example.co.uk', 'example.co.uk')) + self.validate_sld(domain='www.example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'example.co.uk', 'example.co.uk')) + self.validate_sld(domain='www.en.example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'example.co.uk', 'example.co.uk')) + + def test_04c_known_etld(self): + self.psl = ps2.PublicSuffixList(['co.uk']) + + # TLD of eTLD is automatically registered + self.validate_components(domain='uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='.uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.uk', + expected=((None, 'example', 'uk'), (None, 'example', 'uk'), + (None, 'example', 'uk'), (None, 'example', 'uk'))) + self.validate_components(domain='example..uk', + expected=(('example', None, 'uk'), ('example', None, 'uk'), + ('example', None, 'uk'), ('example', None, 'uk'))) + self.validate_components(domain='.example.uk', + expected=((None, 'example', 'uk'), (None, 'example', 'uk'), + (None, 'example', 'uk'), (None, 'example', 'uk'))) + self.validate_components(domain='.example.uk.', + expected=((None, None, 
None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.uk', + expected=(('www', 'example', 'uk'), ('www', 'example', 'uk'), + ('www', 'example', 'uk'), ('www', 'example', 'uk'))) + self.validate_components(domain='www.en.example.uk', + expected=(('www.en', 'example', 'uk'), ('www.en', 'example', 'uk'), + ('www.en', 'example', 'uk'), ('www.en', 'example', 'uk'))) + + # Validate ETLD + self.validate_components(domain='co.uk', + expected=((None, None, 'co.uk'), (None, None, 'co.uk'), + (None, None, 'co.uk'), (None, None, 'co.uk'))) + self.validate_components(domain='.co.uk', + expected=((None, None, 'co.uk'), (None, None, 'co.uk'), + (None, None, 'co.uk'), (None, None, 'co.uk'))) + self.validate_components(domain='co.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.co.uk', + expected=((None, 'example', 'co.uk'), (None, 'example', 'co.uk'), + (None, 'example', 'co.uk'), (None, 'example', 'co.uk'))) + self.validate_components(domain='example..co.uk', + expected=(('example', None, 'co.uk'), ('example', None, 'co.uk'), + ('example', None, 'co.uk'), ('example', None, 'co.uk'))) + self.validate_components(domain='.example.co.uk', + expected=((None, 'example', 'co.uk'), (None, 'example', 'co.uk'), + (None, 'example', 'co.uk'), (None, 'example', 'co.uk'))) + self.validate_components(domain='.example.com.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.co.uk', + expected=(('www', 'example', 'co.uk'), ('www', 'example', 'co.uk'), + ('www', 'example', 'co.uk'), ('www', 'example', 'co.uk'))) + self.validate_components(domain='www.en.example.co.uk', + expected=(('www.en', 'example', 'co.uk'), ('www.en', 'example', 'co.uk'), + ('www.en', 'example', 'co.uk'), ('www.en', 'example', 'co.uk'))) + + def 
test_05_simple_list_etld_with_negated_tld(self): + self.psl = ps2.PublicSuffixList(['com', 'co.uk', '!org']) + + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_tld(tld='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_tld(tld='org', expected=(None, None, None, None)) + + def test_06_simple_list_etld_with_negated_etld(self): + self.psl = ps2.PublicSuffixList(['com', 'gov.uk', '!org', '!co.uk']) + + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_tld(tld='gov.uk', expected=('gov.uk', 'gov.uk', 'gov.uk', 'gov.uk')) + self.validate_tld(tld='org', expected=(None, None, None, None)) + self.validate_tld(tld='co.uk', expected=('uk', 'uk', 'uk', 'uk')) + + def test_07a_wildcard(self): + self.psl = ps2.PublicSuffixList(['*']) + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', 'abc', 'abc', None)) + self.validate_tld(tld='aBc', expected=('abc', 'abc', 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', None)) + self.validate_tld(tld='cOm', expected=('com', 'com', 'com', None)) + + def test_07b_wildcard(self): + self.psl = ps2.PublicSuffixList(['*']) + + self.validate_sld(domain='com', expected=('com', 'com', 'com', None)) + self.validate_sld(domain='.com', expected=(None, None, None, None)) + self.validate_sld(domain='com.', expected=(None, None, None, None)) + self.validate_sld(domain='example..com', expected=(None, None, None, None)) + self.validate_sld(domain='example.com.', expected=(None, None, None, None)) + self.validate_sld(domain='example.com', + expected=('example.com', 'example.com', 
'example.com', None)) + self.validate_sld(domain='.exAMPle.com', + expected=('example.com', 'example.com', 'example.com', None)) + self.validate_sld(domain='www.example.com', + expected=('example.com', 'example.com', 'example.com', None)) + self.validate_sld(domain='www.en.example.com', + expected=('example.com', 'example.com', 'example.com', None)) + + def test_07c_wildcard(self): + self.psl = ps2.PublicSuffixList(['*']) + + self.validate_components(domain='com', + expected=((None, None, 'com'), (None, None, 'com'), + (None, None, 'com'), (None, None, None))) + self.validate_components(domain='.com', + expected=((None, None, 'com'), (None, None, 'com'), + (None, None, 'com'), (None, None, None))) + self.validate_components(domain='com.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.com', + expected=((None, 'example', 'com'), (None, 'example', 'com'), + (None, 'example', 'com'), (None, None, None))) + self.validate_components(domain='example..com', + expected=(('example', None, 'com'), ('example', None, 'com'), + ('example', None, 'com'), (None, None, None))) + self.validate_components(domain='.example.com', + expected=((None, 'example', 'com'), (None, 'example', 'com'), + (None, 'example', 'com'), (None, None, None))) + self.validate_components(domain='.example.com.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.com', + expected=(('www', 'example', 'com'), ('www', 'example', 'com'), + ('www', 'example', 'com'), (None, None, None))) + self.validate_components(domain='www.en.example.com', + expected=(('www.en', 'example', 'com'), ('www.en', 'example', 'com'), + ('www.en', 'example', 'com'), (None, None, None))) + + def test_08a_negated_wildcard(self): + self.psl = ps2.PublicSuffixList(['!*']) + + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=(None, None, 
'abc', None)) + self.validate_tld(tld='aBc', expected=(None, None, 'abc', None)) + self.validate_tld(tld='com', expected=(None, None, 'com', None)) + self.validate_tld(tld='cOm', expected=(None, None, 'com', None)) + + def test_08b_negated_wildcard(self): + self.psl = ps2.PublicSuffixList(['!*']) + + # validate the behavior with negated TLD + self.validate_sld(domain='org', expected=(None, None, 'org', None)) + self.validate_sld(domain='.org', expected=(None, None, None, None)) + self.validate_sld(domain='org.', expected=(None, None, None, None)) + self.validate_sld(domain='example..org', expected=(None, None, None, None)) + self.validate_sld(domain='example.org.', expected=(None, None, None, None)) + self.validate_sld(domain='example.org', expected=(None, None, 'example.org', None)) + self.validate_sld(domain='.exAMPle.org', expected=(None, None, 'example.org', None)) + self.validate_sld(domain='www.example.org', expected=(None, None, 'example.org', None)) + self.validate_sld(domain='www.en.example.org', expected=(None, None, 'example.org', None)) + + def test_08c_negated_wildcard(self): + self.psl = ps2.PublicSuffixList(['!*']) + + self.validate_components(domain='org', + expected=((None, None, None), (None, None, None), + (None, None, 'org'), (None, None, None))) + self.validate_components(domain='.org', + expected=((None, None, None), (None, None, None), + (None, None, 'org'), (None, None, None))) + self.validate_components(domain='org.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.org', + expected=((None, None, None), (None, None, None), + (None, 'example', 'org'), (None, None, None))) + self.validate_components(domain='example..org', + expected=((None, None, None), (None, None, None), + ('example', None, 'org'), (None, None, None))) + self.validate_components(domain='.example.org', + expected=((None, None, None), (None, None, None), + (None, 'example', 'org'), (None, 
None, None))) + self.validate_components(domain='.example.org.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.org', + expected=((None, None, None), (None, None, None), + ('www', 'example', 'org'), (None, None, None))) + self.validate_components(domain='www.en.example.org', + expected=((None, None, None), (None, None, None), + ('www.en', 'example', 'org'), (None, None, None))) + + def test_09_simple_list_wildcard(self): + self.psl = ps2.PublicSuffixList(['*', 'com']) + self.validate_tld_empty_label() + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='abc', expected=('abc', 'abc', 'abc', None)) + + def test_10_simple_list_negated_wildcard(self): + self.psl = ps2.PublicSuffixList(['!*', 'com']) + self.validate_tld_empty_label() + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='abc', expected=(None, None, 'abc', None)) + + def test_11a_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['*.uk', 'com']) + + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='co.uk', expected=('co.uk', 'co.uk', 'uk', 'uk')) + self.validate_tld(tld='gov.uk', expected=('gov.uk', 'gov.uk', 'uk', 'uk')) + + def test_11b_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['*.uk']) + + # TLD of eTLD is automatically registered + self.validate_sld(domain='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_sld(domain='.uk', expected=(None, None, None, None)) + self.validate_sld(domain='uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..uk', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk', + expected=('example.uk', 'example.uk', 
'example.uk', 'example.uk')) + self.validate_sld(domain='.exAMPle.uk', + expected=(None, None, 'example.uk', 'example.uk')) + self.validate_sld(domain='www.example.uk', + expected=('www.example.uk', 'www.example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='www.en.example.uk', + expected=('en.example.uk', 'en.example.uk', 'example.uk', 'example.uk')) + + # validate ETLD + self.validate_sld(domain='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='.co.uk', expected=(None, None, 'co.uk', 'co.uk')) + self.validate_sld(domain='co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..co.uk', expected=(None, None, 'co.uk', 'co.uk')) + self.validate_sld(domain='example.co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='.exAMPle.co.uk', + expected=('example.co.uk', 'example.co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='www.example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='www.en.example.co.uk', + expected=('example.co.uk', 'example.co.uk', 'co.uk', 'co.uk')) + + def test_11c_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['*.uk']) + + # TLD of ETLD is auto-registered + self.validate_components(domain='uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='.uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.uk', + expected=((None, None, 'example.uk'), (None, None, 'example.uk'), + (None, 'example', 'uk'), (None, 'example', 'uk'))) + self.validate_components(domain='example..uk', + 
expected=(('example', None, 'uk'), ('example', None, 'uk'), + ('example', None, 'uk'), ('example', None, 'uk'))) + self.validate_components(domain='.example.uk', + expected=((None, None, 'example.uk'), (None, None, 'example.uk'), + (None, 'example', 'uk'), (None, 'example', 'uk'))) + self.validate_components(domain='.example.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.uk', + expected=((None, 'www', 'example.uk'), (None, 'www', 'example.uk'), + ('www', 'example', 'uk'), ('www', 'example', 'uk'))) + self.validate_components(domain='www.en.example.uk', + expected=(('www', 'en', 'example.uk'), ('www', 'en', 'example.uk'), + ('www.en', 'example', 'uk'), ('www.en', 'example', 'uk'))) + + # Validate wildcard ETLD + self.validate_components(domain='co.uk', + expected=((None, None, 'co.uk'), (None, None, 'co.uk'), + (None, 'co', 'uk'), (None, 'co', 'uk'))) + self.validate_components(domain='.co.uk', + expected=((None, None, 'co.uk'), (None, None, 'co.uk'), + (None, 'co', 'uk'), (None, 'co', 'uk'))) + self.validate_components(domain='co.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.co.uk', + expected=((None, 'example', 'co.uk'), (None, 'example', 'co.uk'), + ('example', 'co', 'uk'), ('example', 'co', 'uk'))) + self.validate_components(domain='example..co.uk', + expected=(('example', None, 'co.uk'), ('example', None, 'co.uk'), + ('example.', 'co', 'uk'), ('example.', 'co', 'uk'))) + self.validate_components(domain='.example.co.uk', + expected=((None, 'example', 'co.uk'), (None, 'example', 'co.uk'), + ('.example', 'co', 'uk'), ('.example', 'co', 'uk'))) + self.validate_components(domain='.example.co.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.co.uk', + expected=(('www', 
'example', 'co.uk'), ('www', 'example', 'co.uk'), + ('www.example', 'co', 'uk'), ('www.example', 'co', 'uk'))) + self.validate_components(domain='www.en.example.co.uk', + expected=(('www.en', 'example', 'co.uk'), ('www.en', 'example', 'co.uk'), + ('www.en.example', 'co', 'uk'), ('www.en.example', 'co', 'uk'))) + + def test_12_wildcard_list_negated_etld(self): + self.psl = ps2.PublicSuffixList(['*.uk', '!co.uk', 'com']) + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='gov.uk', expected=('gov.uk', 'gov.uk', 'uk', 'uk')) + self.validate_tld(tld='co.uk', expected=('uk', 'uk', 'uk', 'uk')) + + def test_13a_negated_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['!*.uk', 'co.uk', 'com']) + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_tld(tld='gov.uk', expected=('uk', 'uk', 'uk', 'uk')) + + def test_13b_negated_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['!*.uk']) + + # TLD of eTLD is automatically registered + self.validate_sld(domain='uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_sld(domain='.uk', expected=(None, None, None, None)) + self.validate_sld(domain='uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..uk', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='.exAMPle.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + self.validate_sld(domain='www.example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) 
+ self.validate_sld(domain='www.en.example.uk', + expected=('example.uk', 'example.uk', 'example.uk', 'example.uk')) + + # validate the behavior with negated TLD + self.validate_sld(domain='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='.co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example..co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='example.co.uk.', expected=(None, None, None, None)) + self.validate_sld(domain='example.co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='.exAMPle.co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='www.example.co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_sld(domain='www.en.example.co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + + def test_13c_negated_wildcard_etld(self): + self.psl = ps2.PublicSuffixList(['!*.uk']) + + # TLD of ETLD is auto-registered and positive even in the case of negated wildcard + self.validate_components(domain='uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='.uk', + expected=((None, None, 'uk'), (None, None, 'uk'), + (None, None, 'uk'), (None, None, 'uk'))) + self.validate_components(domain='uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.uk', + expected=((None, 'example', 'uk'), (None, 'example', 'uk'), + (None, 'example', 'uk'), (None, 'example', 'uk'))) + self.validate_components(domain='example..uk', + expected=(('example', None, 'uk'), ('example', None, 'uk'), + ('example', None, 'uk'), ('example', None, 'uk'))) + self.validate_components(domain='.example.uk', + expected=((None, 'example', 'uk'), (None, 'example', 'uk'), + (None, 'example', 'uk'), 
(None, 'example', 'uk'))) + self.validate_components(domain='.example.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.uk', + expected=(('www', 'example', 'uk'), ('www', 'example', 'uk'), + ('www', 'example', 'uk'), ('www', 'example', 'uk'))) + self.validate_components(domain='www.en.example.uk', + expected=(('www.en', 'example', 'uk'), ('www.en', 'example', 'uk'), + ('www.en', 'example', 'uk'), ('www.en', 'example', 'uk'))) + + # Verify negated wildcard etld + self.validate_components(domain='co.uk', + expected=((None, 'co', 'uk'), (None, 'co', 'uk'), + (None, 'co', 'uk'), (None, 'co', 'uk'))) + self.validate_components(domain='.co.uk', + expected=((None, 'co', 'uk'), (None, 'co', 'uk'), + (None, 'co', 'uk'), (None, 'co', 'uk'))) + self.validate_components(domain='co.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='example.co.uk', + expected=(('example', 'co', 'uk'), ('example', 'co', 'uk'), + ('example', 'co', 'uk'), ('example', 'co', 'uk'))) + self.validate_components(domain='example..co.uk', + expected=(('example.', 'co', 'uk'), ('example.', 'co', 'uk'), + ('example.', 'co', 'uk'), ('example.', 'co', 'uk'))) + self.validate_components(domain='.example.co.uk', + expected=(('.example', 'co', 'uk'), ('.example', 'co', 'uk'), + ('.example', 'co', 'uk'), ('.example', 'co', 'uk'))) + self.validate_components(domain='.example.co.uk.', + expected=((None, None, None), (None, None, None), + (None, None, None), (None, None, None))) + self.validate_components(domain='www.example.co.uk', + expected=(('www.example', 'co', 'uk'), ('www.example', 'co', 'uk'), + ('www.example', 'co', 'uk'), ('www.example', 'co', 'uk'))) + self.validate_components(domain='www.en.example.co.uk', + expected=(('www.en.example', 'co', 'uk'), ('www.en.example', 'co', 'uk'), + ('www.en.example', 'co', 'uk'), 
('www.en.example', 'co', 'uk'))) + + def test_14_complex_list(self): + self.psl = ps2.PublicSuffixList(['com', '!org', '!*.uk', 'co.uk', '*.us', '!ca.us', '*.ng']) + self.validate_tld_empty_label() + self.validate_tld(tld='abc', expected=('abc', None, 'abc', None)) + self.validate_tld(tld='com', expected=('com', 'com', 'com', 'com')) + self.validate_tld(tld='org', expected=(None, None, None, None)) + self.validate_tld(tld='co.uk', expected=('co.uk', 'co.uk', 'co.uk', 'co.uk')) + self.validate_tld(tld='gov.uk', expected=('uk', 'uk', 'uk', 'uk')) + self.validate_tld(tld='wa.us', expected=('wa.us', 'wa.us', 'us', 'us')) + self.validate_tld(tld='ca.us', expected=('us', 'us', 'us', 'us')) + self.validate_tld(tld='abc.ng', expected=('abc.ng', 'abc.ng', 'ng', 'ng'))