Skip to content

Commit 723bdd3

Browse files
committed
修复tldextract cache问题(john-kurkowski/tldextract#254)
1 parent 06462b1 commit 723bdd3

File tree

4 files changed

+19
-8
lines changed

4 files changed

+19
-8
lines changed

libs/regex.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# -*- coding:utf-8 -*-
33
import re
4-
import tldextract
4+
from utils.hostsplit import domextract
55

66

77
ipv4 = re.compile(r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$")
@@ -47,13 +47,13 @@ def is_valid_ip(text):
4747

4848

4949
def is_valid_domain(text):
50-
if domain.match(text) and "." in text[-7:] and tldextract.extract(text).suffix != "":
50+
if domain.match(text) and "." in text[-7:] and domextract(text).suffix != "":
5151
return True
5252
return False
5353

5454

5555
def maybe_url(text):
56-
if url.match(text) and tldextract.extract(text).suffix != "":
56+
if url.match(text) and domextract(text).suffix != "":
5757
return True
5858
return False
5959

@@ -68,7 +68,7 @@ def find_domains(text):
6868
domains = set()
6969
for item in domain_find_regex.findall(text):
7070
# in general, domain suffix length less than 6.
71-
if "." in item[-7:] and tldextract.extract(item).suffix != "":
71+
if "." in item[-7:] and domextract(item).suffix != "":
7272
domains.add(item)
7373
return list(domains)
7474

libs/web/url.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
# -*- coding:utf-8 -*-
33
import os
44
import re
5-
import tldextract
65
import html as htmlparser
76
from urllib.parse import unquote
87
from collections import namedtuple
98
from urllib.parse import urlparse
109
from libs.regex import html, common_dom
10+
from utils.hostsplit import domextract
1111

1212

1313
def normal_url(url):
@@ -25,7 +25,7 @@ def urlsite(url):
2525
if re.match(r'^\w+://', url):
2626
site = urlparse(url).netloc
2727
#
28-
ext = tldextract.extract(url)
28+
ext = domextract(url)
2929
if not ext.registered_domain:
3030
return UrlSiteResult(subdomain='', domain='', suffix='',
3131
reg_domain='', hostname=site)

tools/alexa_bloom.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/usr/bin/env python
22
# -*- coding:utf-8 -*-
33
import os
4-
import tldextract
54
from pybloom_live import BloomFilter
65
from utils.filedir import reader
6+
from utils.hostsplit import domextract
77
from conf.paths import PRIVATE_RESOURCE_HOME
88
from conf.paths import ALEXA_BLOOM_FILTER_PATH
99

@@ -25,7 +25,7 @@ def check(hosts):
2525
bloom = BloomFilter.fromfile(fopen)
2626
for host in hosts:
2727
host = host.lower()
28-
reg_domain = tldextract.extract(host).registered_domain
28+
reg_domain = domextract(host).registered_domain
2929
if not reg_domain:
3030
reg_domain = host
3131
if reg_domain in bloom:

utils/hostsplit.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env python
2+
# -*- coding:utf-8 -*-
3+
import tldextract
4+
5+
6+
domextract = tldextract.TLDExtract(cache_dir=False)
7+
8+
9+
10+
11+

0 commit comments

Comments
 (0)