check.py
"""Check a list of domains for the South Korean warning.or.kr block page and record matches."""
import asyncio
import logging
import time

import aiohttp
from aiohttp import TCPConnector

# Maximum number of requests in flight at once
CONCURRENT_REQUESTS = 50
# Connection pool size for the TCP connector
CONNECTION_LIMIT = 100
# Number of URLs processed per batch
BATCH_SIZE = 1000

# Log to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("log.log"),
        logging.StreamHandler()
    ]
)
async def fetch_url(session, url, output_file):
    """Request a single URL and record it if the censorship warning page is returned."""
    start_time = time.time()
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5), allow_redirects=False) as response:
            duration = time.time() - start_time
            if response.status == 200:
                text = await response.text()
                if "warning.or.kr/i1.html" in text:
                    logging.info(f"Keyword found in {url} (took {duration:.2f} seconds)")
                    with open(output_file, 'a', encoding='utf-8') as f:
                        f.write(f"{url} (took {duration:.2f} seconds)\n")
            elif response.status in (301, 302, 303, 307, 308):
                location = response.headers.get('Location')
                logging.info(f"Redirect found for {url} to {location} (took {duration:.2f} seconds)")
            else:
                logging.info(f"Non-200 status code {response.status} for {url} (took {duration:.2f} seconds)")
    except Exception as e:
        logging.error(f"Error accessing {url}: {e}")
async def bound_fetch(sem, session, url, output_file):
    # Limit concurrency with the shared semaphore
    async with sem:
        await fetch_url(session, url, output_file)


def clean_url(url):
    # Prepend a scheme so bare domains become valid URLs, e.g. "example.com" -> "http://example.com"
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
async def process_urls(urls, output_file):
    sem = asyncio.Semaphore(CONCURRENT_REQUESTS)
    connector = TCPConnector(limit=CONNECTION_LIMIT)
    async with aiohttp.ClientSession(connector=connector) as session:
        # Wrap each coroutine in a Task so it can be cancelled individually below
        tasks = []
        for url in urls:
            cleaned_url = clean_url(url.strip())
            if cleaned_url:
                tasks.append(asyncio.create_task(
                    bound_fetch(sem, session, cleaned_url, output_file)))
        try:
            await asyncio.gather(*tasks)
        except asyncio.CancelledError:
            logging.error("Tasks were cancelled due to asyncio.CancelledError")
            for task in tasks:
                if not task.done():
                    task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
async def process_file_in_batches(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        batch = []
        for line in file:
            batch.append(line.strip())
            if len(batch) >= BATCH_SIZE:
                await process_urls(batch, output_file)
                batch = []
        if batch:
            await process_urls(batch, output_file)
def main():
    input_file = 'domains.txt'
    output_file = 'output.txt'
    # Truncate the output file before a new run
    open(output_file, 'w').close()
    # Run the batch processing on a fresh event loop
    asyncio.run(process_file_in_batches(input_file, output_file))


if __name__ == '__main__':
    start_time = time.time()
    main()
    logging.info(f"Completed in {time.time() - start_time:.2f} seconds")