File tree Expand file tree Collapse file tree 4 files changed +19
-8
lines changed Expand file tree Collapse file tree 4 files changed +19
-8
lines changed Original file line number Diff line number Diff line change 1
1
#!/usr/bin/env python
2
2
# -*- coding:utf-8 -*-
3
3
import re
4
- import tldextract
4
+ from utils . hostsplit import domextract
5
5
6
6
7
7
ipv4 = re .compile (r"^((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)(\.((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d)){3}$" )
@@ -47,13 +47,13 @@ def is_valid_ip(text):
47
47
48
48
49
49
def is_valid_domain (text ):
50
- if domain .match (text ) and "." in text [- 7 :] and tldextract . extract (text ).suffix != "" :
50
+ if domain .match (text ) and "." in text [- 7 :] and domextract (text ).suffix != "" :
51
51
return True
52
52
return False
53
53
54
54
55
55
def maybe_url (text ):
56
- if url .match (text ) and tldextract . extract (text ).suffix != "" :
56
+ if url .match (text ) and domextract (text ).suffix != "" :
57
57
return True
58
58
return False
59
59
@@ -68,7 +68,7 @@ def find_domains(text):
68
68
domains = set ()
69
69
for item in domain_find_regex .findall (text ):
70
70
# in general, domain suffix length less than 6.
71
- if "." in item [- 7 :] and tldextract . extract (item ).suffix != "" :
71
+ if "." in item [- 7 :] and domextract (item ).suffix != "" :
72
72
domains .add (item )
73
73
return list (domains )
74
74
Original file line number Diff line number Diff line change 2
2
# -*- coding:utf-8 -*-
3
3
import os
4
4
import re
5
- import tldextract
6
5
import html as htmlparser
7
6
from urllib .parse import unquote
8
7
from collections import namedtuple
9
8
from urllib .parse import urlparse
10
9
from libs .regex import html , common_dom
10
+ from utils .hostsplit import domextract
11
11
12
12
13
13
def normal_url (url ):
@@ -25,7 +25,7 @@ def urlsite(url):
25
25
if re .match (r'^\w+://' , url ):
26
26
site = urlparse (url ).netloc
27
27
#
28
- ext = tldextract . extract (url )
28
+ ext = domextract (url )
29
29
if not ext .registered_domain :
30
30
return UrlSiteResult (subdomain = '' , domain = '' , suffix = '' ,
31
31
reg_domain = '' , hostname = site )
Original file line number Diff line number Diff line change 1
1
#!/usr/bin/env python
2
2
# -*- coding:utf-8 -*-
3
3
import os
4
- import tldextract
5
4
from pybloom_live import BloomFilter
6
5
from utils .filedir import reader
6
+ from utils .hostsplit import domextract
7
7
from conf .paths import PRIVATE_RESOURCE_HOME
8
8
from conf .paths import ALEXA_BLOOM_FILTER_PATH
9
9
@@ -25,7 +25,7 @@ def check(hosts):
25
25
bloom = BloomFilter .fromfile (fopen )
26
26
for host in hosts :
27
27
host = host .lower ()
28
- reg_domain = tldextract . extract (host ).registered_domain
28
+ reg_domain = domextract (host ).registered_domain
29
29
if not reg_domain :
30
30
reg_domain = host
31
31
if reg_domain in bloom :
Original file line number Diff line number Diff line change
1
+ #!/usr/bin/env python
2
+ # -*- coding:utf-8 -*-
3
+ import tldextract
4
+
5
+
6
+ domextract = tldextract .TLDExtract (cache_dir = False )
7
+
8
+
9
+
10
+
11
+
You can’t perform that action at this time.
0 commit comments