From 2fa1b9c5f875587a5b5f9cfdb030454fb8251242 Mon Sep 17 00:00:00 2001 From: Lee CQ Date: Sun, 27 Aug 2023 17:25:26 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E5=B0=8F=E8=AF=B4=20?= =?UTF-8?q?=E4=BD=BF=E7=94=A8selenium=20=E6=94=AF=E6=8C=81=E7=AB=A0?= =?UTF-8?q?=E8=8A=82=E5=88=86=E9=A1=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- actions/down_txt/chrome.py | 6 + actions/down_txt/config.py | 82 ++++++++++++ actions/down_txt/down_book.py | 211 ++++++++++++++++++++----------- actions/down_txt/down_list.py | 51 ++++++++ actions/down_txt/learn_chrome.py | 127 +++++++++++++++++++ actions/down_txt/main.py | 21 +++ actions/down_txt/replace.py | 25 +++- 7 files changed, 444 insertions(+), 79 deletions(-) create mode 100644 actions/down_txt/config.py create mode 100644 actions/down_txt/down_list.py create mode 100644 actions/down_txt/learn_chrome.py create mode 100644 actions/down_txt/main.py diff --git a/actions/down_txt/chrome.py b/actions/down_txt/chrome.py index b6a652e..c19c1a4 100644 --- a/actions/down_txt/chrome.py +++ b/actions/down_txt/chrome.py @@ -12,6 +12,7 @@ def chrome(executable_path='chromedriver', + window_sizes: tuple = None, is_headless: bool = False, headless=None, is_maximized: bool = False, maximized=None, is_incognito: bool = False, incognito=None, @@ -25,6 +26,7 @@ def chrome(executable_path='chromedriver', :param ua: :param js: + :param window_sizes: 窗口大小 :param maximized: 窗口最大化 :param notification: 通知 :param pic: 显示图片 @@ -62,6 +64,8 @@ def chrome(executable_path='chromedriver', browser = Chrome(service=__service, options=__opt, ) + if window_sizes: + browser.set_window_size(*window_sizes) return browser @@ -93,4 +97,6 @@ def _option(headless, maximized, incognito, js, ua, pic, notifications, **kwargs option.add_argument(_ if isinstance(_, str) else '') option.add_experimental_option('prefs', {'profile.default_content_setting_values': _pre}) + option.add_experimental_option('excludeSwitches', ['enable-automation']) + option.add_argument("--disable-blink-features=AutomationControlled") # 这里添加一些启动的参数 return option diff --git a/actions/down_txt/config.py b/actions/down_txt/config.py new file mode 100644 index 0000000..eea3f9f --- /dev/null +++ b/actions/down_txt/config.py @@ -0,0 +1,82 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +@File Name : config.py.py +@Author : LeeCQ +@Date-Time : 2023/7/1 13:01 +""" +import logging +from dataclasses import dataclass + + +@dataclass +class BookCss: + BOOK_NAME: str + BOOK_AUTHOR: str + BOOK_LIST: str + BOOK_CONTENT: str + BOOK_LIST_NEXT: str = None + BOOK_CONTENT_NEXT: str = None + + +CSS_CONFIG = { + "www.biququ.la": { + "BOOK_NAME": "#info > h1", + "BOOK_AUTHOR": "#info > p:nth-child(2)", + "BOOK_LIST": "#list > dl > dd > a", + "BOOK_CONTENT": "#content", + }, + "qushu.org": { + "BOOK_NAME": "body > div.container.autoheight > div.list-chapter > h1 > a", + "BOOK_AUTHOR": "body > div.container.autoheight > div.list-chapter > h2 > a:nth-child(2)", + "BOOK_LIST": "body > div.container.autoheight > div.list-chapter > div.booklist > ul > li > a", + "BOOK_LIST_NEXT": "body > div.container.autoheight > div.list-chapter > div.booklist > div:nth-child(2) > span.right > a", + "BOOK_CONTENT": "#chaptercontent", + "BOOK_CONTENT_NEXT": "#chaptercontent > p > a", + } +} + + +def css_finder(domain: str) -> BookCss: + return BookCss(**CSS_CONFIG.get(domain)) + + +LOG_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "simple": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + }, + }, + # "filters": {}, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + }, + "file": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "simple", + "filename": "down_book.log", + "maxBytes": 1024 * 1024 * 5, + "backupCount": 5, + "encoding": "utf-8" + }, + + }, + "loggers": { + "gh-actions": { + "level": "DEBUG", + "handlers": ["console"], + "propagate": False, + }, + }, + "root": { + "level": "DEBUG", + "handlers": ["file"], + }, + +} diff --git a/actions/down_txt/down_book.py b/actions/down_txt/down_book.py index 586944e..2512e99 100644 --- a/actions/down_txt/down_book.py +++ b/actions/down_txt/down_book.py @@ -12,20 +12,42 @@ import sys import threading import time +import re +import urllib.parse from pathlib import Path -from queue import Queue, Empty +from queue import Queue as _Queue, Empty +import pymysql from dotenv import load_dotenv from selenium.webdriver.common.by import By -from selenium.common.exceptions import NoSuchWindowException, NoSuchElementException +from selenium.common.exceptions import NoSuchWindowException, NoSuchElementException, WebDriverException +from selenium.webdriver.common.action_chains import ActionChains +from sqllib.common.error import SqlWriteError from sqllib import MySqlAPI, SQLiteAPI, BaseSQLAPI from chrome import chrome from replace import replace, replace_from_sql +from config import css_finder logger = logging.getLogger("gh-actions.down-txt") +def is_url(url) -> bool: + """判断是否为URL""" + return re.match(r'^https?://', url) is not None + + +class Queue(_Queue): + + def get_all(self): + """获取队列中所有的数据""" + try: + while True: + yield self.get_nowait() + except Empty: + pass + + class DownTxt: sql: BaseSQLAPI = None book_name: str = None @@ -38,12 +60,15 @@ def __init__(self, url, _sql=None): pic=False, notification=False, # is_headless=True, - ua='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35' + ua='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35' ) + self.browser.set_window_size(800, 600) self.set_sql(_sql) self.body_queue = Queue() self.have_new_tab = True self.have_new_body = True + self.css = css_finder(urllib.parse.urlsplit(self.url).netloc) def set_sql(self, _sql: BaseSQLAPI = None): if _sql: @@ -56,10 +81,11 @@ def create_table(self): if not self.book_name: raise Exception('请先获取书籍名称') _c = (f"CREATE TABLE IF NOT EXISTS `{self.book_name}` ( " - f"idd INT(10) NOT NULL UNIQUE, " - f"url VARCHAR(50) NOT NULL UNIQUE, " - f"title VARCHAR(99), " - f"body VARCHAR(16000)" + f"_id INT(10) NOT NULL PRIMARY KEY AUTO_INCREMENT, " + f"idd INT(10) , " + f"url VARCHAR(255) NOT NULL , " + f"title VARCHAR(512), " + f"body TEXT" f" ) ") return self.sql.write_db(_c) @@ -71,8 +97,33 @@ def verify_bot(self): if self.browser.title != 'Just a moment...': break time.sleep(1) + self.browser.switch_to.frame(self.browser.find_elements(By.TAG_NAME, "iframe")[0]) + ActionChains(self.browser).move_by_offset(60, 300).click().perform() + ActionChains(self.browser).move_to_element( + self.browser.find_element(By.CSS_SELECTOR, '#recaptcha-anchor')).click().perform() + self.browser.find_element(By.CSS_SELECTOR, '#verifying').click() logger.info('验证完成') + def find_list(self): + """获取章节列表""" + next_url = self.url + while True: + try: + if not is_url(next_url): + break + self.browser.get(next_url) + self.browser.implicitly_wait(10) + for item in self.browser.find_elements(By.CSS_SELECTOR, self.css.BOOK_LIST): + yield item.get_attribute('href'), item.text + if self.css.BOOK_LIST_NEXT: + next_url = self.browser.find_element(By.CSS_SELECTOR, self.css.BOOK_LIST_NEXT).get_attribute('href') + else: + break + except NoSuchElementException or WebDriverException: + logger.info('分页结束') + break + yield "END", "END" + # noinspection PyProtectedMember def create_metadata(self): """获取URL头""" @@ -81,34 +132,29 @@ def create_metadata(self): if self.browser.title == 'Just a moment...': logger.warning('需要验证') self.verify_bot() - self.book_name = f"{self.browser.find_element(By.CSS_SELECTOR, '#info > p:nth-child(2)').text.replace('作 者:', '')}-" \ - f"{self.browser.find_element(By.CSS_SELECTOR, '#info > h1').text}-" \ + self.book_name = f"{self.browser.find_element(By.CSS_SELECTOR, self.css.BOOK_AUTHOR).text.replace(' ', '').replace('作者:', '')}-" \ + f"{self.browser.find_element(By.CSS_SELECTOR, self.css.BOOK_NAME).text}-" \ f"{time.strftime('%Y%m', time.localtime())}" self.create_table() - logger.info('Book Name: %s' % self.book_name) - - idd_s = [x[0] for x in self.sql.read_db(f"SELECT idd FROM `{self.book_name}`")] - logger.info('已经写入的章节信息数量: %d' % len(idd_s)) + logger.info('Book Name: %s', self.book_name) - chs = self.browser.find_elements(By.CSS_SELECTOR, '#list > dl > dd > a') - logger.info('最新章节数量: %d' % len(chs)) + url_s = [x[0] for x in self.sql.read_db(f"SELECT url FROM `{self.book_name}`")] + logger.info('已经写入的章节信息数量: %d' % len(url_s)) - if len(idd_s) == len(chs): - logger.info('已经下载完成') + if 'END' in url_s: + logger.info('元数据信息初始化 - 已经完成') return cursor = self.sql._sql.cursor() - for i in self.browser.find_elements(By.CSS_SELECTOR, '#list > dl > dd > a'): - _url = i.get_attribute('href') - _id = _url.split('/')[-1].split('.')[0] - _name = i.text - if int(_id) in idd_s: + for _url, _name in self.find_list(): + if _url in url_s: continue logger.info(f'创建元数据章节:{_name}, {_url}') - cursor.execute(f"INSERT INTO `{self.book_name}` (idd, url, title) VALUES ('{_id}', '{_url}', '{_name}')") + cursor.execute(f"INSERT INTO `{self.book_name}` (url, title) VALUES ( '{_url}', '{_name}')", ) self.sql._sql.commit() + logger.info('元数据信息初始化 - 完成') async def open_tab(self): """异步打开标签页""" @@ -125,14 +171,17 @@ async def open_tab(self): for i, url in enumerate(urls): url = url[0] while len(self.browser.window_handles) >= 5 + 1: - await asyncio.sleep(0.1) + await asyncio.sleep(0.01) logger.info('Will Open %d %s' % (i, url)) - self.browser.switch_to.window(self.browser.window_handles[-1]) + self.browser.switch_to.window(self.browser.window_handles[0]) self.browser.execute_script(f'window.open("{url}","_blank");') - + # self.browser.switch_to.new_window() + # self.browser.get(url) self.body_queue.join() # 等待列队中的数据全部写入数据库 await self.open_tab() # 递归检查是否还有未下载的章节 + # def _get_body(self, url): + async def get_body(self, chapter_window): """获取正文 @@ -142,13 +191,11 @@ async def get_body(self, chapter_window): while self.have_new_tab: for i in self.browser.window_handles: await asyncio.sleep(0.001) - if i == chapter_window: continue try: self.browser.switch_to.window(i) - if self.browser.title == 'Just a moment...': logger.warning('需要验证') self.verify_bot() @@ -157,39 +204,43 @@ async def get_body(self, chapter_window): _body = '\n'.join( i.text for i in self.browser.find_elements( By.CSS_SELECTOR, - '#content > p') + self.css.BOOK_CONTENT) ) - _title = self.browser.title - _url = self.browser.current_url - self.browser.close() + + if _body: + self.body_queue.put((self.browser.current_url, replace(_body))) + logger.info(f'获取正文成功:{self.browser.title} 长度:{len(_body)}') + self.browser.close() + else: + logger.error(f'获取正文失败:{self.browser.title}') except (NoSuchWindowException, NoSuchElementException) as _e: logger.info(f'EE: {type(_e)}') continue - - if _body: - self.body_queue.put((_url, _title, replace(_body))) - logger.info(f'获取正文成功:{_title} 长度:{len(_body)}') - else: - logger.error(f'获取正文失败:{_title}') - continue else: logger.info('获取正文完成 ...') self.have_new_body = False def write_body_sql(self): """写入正文到数据库""" + while True: + if self.have_new_body is False: + logger.info('数据写入完成 ...') + break try: - _url, _title, _body = self.body_queue.get(timeout=2) - self.sql.write_db(f"UPDATE `{self.book_name}` SET body='{_body}' WHERE url='{_url}'") + all_queue = [dict(u=u, b=b) for u, b in self.body_queue.get_all()] # i: (url, body) + if not all_queue: + logger.debug("等待写入正文 ...") + time.sleep(1) + continue + logger.debug('从列队中获取正文: %d', len(all_queue)) + _rows = self.sql.write_rows(f"UPDATE `{self.book_name}` SET body=%(b)s WHERE url=%(u)s", all_queue) + # _url, _title, _body = self.body_queue.get(timeout=2) + # self.sql.update(self.book_name, 'url', _url, body=_body) self.body_queue.task_done() - logger.info('写入正文成功:%s' % _title) - except Empty: - if self.have_new_body is False: - logger.info('数据写入完成 ...') - break - else: - logger.warning('等待新数据写入列队 ...') + logger.info('写入正文成功:影响 %d 行, (剩余列队长度: %d)', _rows, self.body_queue.qsize()) + except (pymysql.err.ProgrammingError, SqlWriteError) as _e: + logger.warning('SQL 写入失败: %s' % _e) def async_write_body(self): """""" @@ -234,50 +285,66 @@ def replace_from_sql(self): return replace_from_sql(self.sql, self.book_name) -def down_txt(url): +def init_sql(sql_connect=None): + # load_dotenv(Path(__file__).parent / '.env') + if sql_connect is None: + try: + mysql_info = json.loads(os.getenv('MYSQL_INFO')) + sql_connect = MySqlAPI( + **mysql_info, + charset='gb18030', + use_unicode=True, + pool=True, + ) + logger.info('使用mysql, Host: %s' % mysql_info['host']) + except Exception: + logger.warning('未配置数据库,使用sqlite') + return sql_connect + + +def down_txt(url, sql_connect=None): + """下载小说""" + if sql_connect is None: + sql_connect = init_sql() a = DownTxt(url) - a.set_sql(_sql=sql) + a.set_sql(sql_connect) a.create_metadata() a.async_write_body() - if os.getenv('REPLACE_FROM_SQL', False): a.replace_from_sql() a.merge_txt() a.github_env() -def replace_sql(table_name): +def replace_sql(table_name, sql_connect=None): + """""" + if sql_connect is None: + sql_connect = init_sql() a = DownTxt('') - a.set_sql(_sql=sql) + a.set_sql(sql_connect) a.book_name = table_name a.replace_from_sql() + a.merge_txt() if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - stream=sys.stderr, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + import logging.config + from config import LOG_CONFIG + + logging.config.dictConfig(LOG_CONFIG) - load_dotenv(Path(__file__).parent / '.env') + if load_dotenv(Path(__file__).parent / '.env'): + logger.info('加载环境变量成功') - DOWNLOAD_LINK = os.getenv('DOWNLOAD_LINK') + DOWNLOAD_LINK = sys.argv[1] if len(sys.argv) > 1 else os.getenv('DOWNLOAD_LINK', ) DOWNLOAD_NAME = os.getenv('DOWNLOAD_NAME', '') - try: - mysql_info = json.loads(os.getenv('MYSQL_INFO')) - sql = MySqlAPI( - **mysql_info, - charset='gb18030', - use_unicode=True, - pool=True, - ) - logger.info('使用mysql, Host: %s' % mysql_info['host']) - except Exception as e: - logger.warning('未配置数据库,使用sqlite') - sql = None + REPLACE_NAME = os.getenv('REPLACE_NAME', '') + + logger.info('DOWNLOAD_LINK: %s' % DOWNLOAD_LINK) if DOWNLOAD_LINK: down_txt(DOWNLOAD_LINK) + elif REPLACE_NAME: + replace_sql(REPLACE_NAME) else: logger.warning('未配置下载链接') diff --git a/actions/down_txt/down_list.py b/actions/down_txt/down_list.py new file mode 100644 index 0000000..d029d0f --- /dev/null +++ b/actions/down_txt/down_list.py @@ -0,0 +1,51 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +@File Name : down_list.py +@Author : LeeCQ +@Date-Time : 2023/5/12 20:26 +""" + +import logging +from multiprocessing import Pool +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor + +from dotenv import load_dotenv + +from down_book import down_txt + +__all__ = ['down_list'] + +logger = logging.getLogger('gh-actions.down-txt.down-list') + +辰东 = ['https://www.biququ.info/html/48260', 'https://www.biququ.info/html/27746', 'https://www.biququ.info/html/4234', + 'https://www.biququ.info/html/27743', 'https://www.biququ.info/html/36094', 'https://www.biququ.info/html/27765', + 'https://www.biququ.info/html/27937'] +忘语 = ['https://www.biququ.info/html/1756', 'https://www.biququ.info/html/81678', 'https://www.biququ.info/html/35912', + 'https://www.biququ.info/html/28936', 'https://www.biququ.info/html/44573', 'https://www.biququ.info/html/8245'] +风凌天下 = ['https://www.biququ.info/html/57569', 'https://www.biququ.info/html/32877', 'https://www.biququ.info/html/9814', + 'https://www.biququ.info/html/27794', 'https://www.biququ.info/html/36606', 'https://www.biququ.info/html/6770'] + + +def multi_thread_down(): + """下载列表""" + with ThreadPoolExecutor(max_workers=5) as executor: + executor.map(down_txt, 风凌天下) + + +def multi_process_down(): + """下载列表""" + with Pool(5) as p: + p.map(down_txt, 风凌天下) + + +if __name__ == '__main__': + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + load_dotenv(Path(__file__).parent / '.env') + + multi_thread_down() diff --git a/actions/down_txt/learn_chrome.py b/actions/down_txt/learn_chrome.py new file mode 100644 index 0000000..ff816d6 --- /dev/null +++ b/actions/down_txt/learn_chrome.py @@ -0,0 +1,127 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +@File Name : learn_chrome.py +@Author : LeeCQ +@Date-Time : 2023/5/7 21:25 +""" + +import logging +import threading +import time +import asyncio + +from selenium.common import NoSuchWindowException, NoSuchElementException +from selenium.webdriver.common.by import By + +from chrome import chrome + +logger = logging.getLogger("gh-actions.down-txt.sql") + + +class Learn: + + def __init__(self): + self.browser = chrome(use_js=False, + # pic=False, + # notification=False, + # is_headless=True + ) + + def new_tab(self): + def _open_tab(_urls): + for url in _urls: + # url = url[0] + if len(self.browser.window_handles) >= 10 + 1: + time.sleep(1) + continue + logger.info('Will Open %s' % url) + self.browser.switch_to.window(self.browser.window_handles[-1]) + a = self.browser.execute_script(f'window.open("{url}","_blank");') + + urls = ['https://qq.com' for _ in range(100)] + list_window = self.browser.window_handles[0] + threading.Thread(target=_open_tab, args=(urls,), daemon=True).start() + time.sleep(2) + + while len(self.browser.window_handles) >= 1: + for i in self.browser.window_handles: + if i == list_window: + continue + + try: + _body = '\n'.join(i.text for i in self.browser.find_elements(By.CSS_SELECTOR, + 'body > div.global > div:nth-child(7) > div.col.col-2.fl > div > div.title.nst')) + _url = self.browser.current_url + _title = self.browser.title + self.browser.close() + except (NoSuchWindowException, NoSuchElementException): + continue + + if _body: + logger.info(f'获取正文成功:{_title} 长度:{len(_body)}') + else: + logger.error(f'获取正文失败:{_title}') + continue + + print(_body) + logger.info('Done.') + + async def open_tab(self, _urls): + for i, url in enumerate(_urls): + # url = url[0] + if len(self.browser.window_handles) >= 9 + 1: + logger.debug('sleep 0.1') + await asyncio.sleep(0.1) + continue + logger.info('Will Open %03d %s' % (i, url)) + self.browser.switch_to.window(self.browser.window_handles[-1]) + self.browser.execute_script(f'window.open("{url}","_blank");') + + async def get_body(self): + list_window = self.browser.window_handles[0] + + while len(self.browser.window_handles) >= 2: + for i in self.browser.window_handles: + logger.debug('await 切换') + await asyncio.sleep(0.001) + if i == list_window: + continue + try: + self.browser.switch_to.window(i) + _body = '\n'.join(i.text for i in self.browser.find_elements( + By.CSS_SELECTOR, + 'body > div.global > div:nth-child(7) > div.col.col-2.fl > div > div.title.nst')) + _title = self.browser.title + self.browser.close() + except (NoSuchWindowException, NoSuchElementException) as e: + logger.info(f'EE: {e}') + continue + + if _body: + logger.info(f'获取正文成功:{_title} 长度:{len(_body)}') + else: + logger.error(f'获取正文失败:{_title}') + continue + + print(_body) + logger.info('Done.') + + async def main(self): + task1 = asyncio.create_task( + self.open_tab(['https://qq.com' for _ in range(100)]) + ) + + task2 = asyncio.create_task( + self.get_body() + ) + await task1 + await task2 + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(module)s - %(levelname)s - %(message)s' + ) + _l = Learn() + asyncio.run(_l.main()) diff --git a/actions/down_txt/main.py b/actions/down_txt/main.py new file mode 100644 index 0000000..fb0180b --- /dev/null +++ b/actions/down_txt/main.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +@File Name : main.py.py +@Author : LeeCQ +@Date-Time : 2023/5/29 21:04 +""" +import logging +from pathlib import Path + +from dotenv import load_dotenv + +from down_book import down_txt + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +load_dotenv(Path(__file__).parent / '.env') +down_txt('https://www.biququ.info/html/4593/') diff --git a/actions/down_txt/replace.py b/actions/down_txt/replace.py index 6aaa299..6fbbd93 100644 --- a/actions/down_txt/replace.py +++ b/actions/down_txt/replace.py @@ -18,10 +18,18 @@
br> ^第.*?章.*?\n +起点手游《吞噬星空》新区火爆 起点币月票ipadmini天天送 官网 tsxk\.qidian\.com(未完待续) +起点中文网www.欢迎广大书友光临阅读,最新、最快、最火的连载作品尽在起点原创! +番茄推荐一本好书《宦海风月》,历史架空类的,很不错啊,地址:http://www\./Book/1623706\.aspx +[\((\[【].*?qidian\.com.*?[\))\]】] +章节报错 +热门推荐:.*$ + """ REPLACE_DICT = { - 'se': '色', 'xian': '现', 'xiao': '小', 'xue': '雪', 'xun': '寻', 'xuan': '玄', 'yan': '眼' + '丢shi': '丢失', + 'se': '色', 'xian': '现', 'xiao': '小', 'xue': '雪', 'xun': '寻', 'xuan': '玄', 'yan': '眼', } __all__ = ['replace', 'replace_from_sql'] @@ -29,13 +37,18 @@ def replace(s: str): # remove - s = re.sub(r'\n+', '\n', s) + s = re.sub(r'\n+', '\n', s, flags=re.IGNORECASE) for line in REMOVE_TEXT.split('\n'): - s = re.sub(line, '', s) - + _s = s + s = re.sub(line, '', s, flags=re.IGNORECASE) + if _s != s: + logger.debug('Removed: %s', line) # replace for k, v in REPLACE_DICT.items(): + _s = s s = re.sub(k, v, s) + if _s != s: + logger.debug('Removed: %s', k) return s @@ -66,7 +79,7 @@ def replace_from_sql(sql: BaseSQLAPI, table: str = None): for title, body in _lines: new_body = replace(body) if body == new_body: - logger.info(f'{title} No change. continue.') + # logger.info(f'{title} No change. continue.') continue logger.info(f'Replace {_t} {title}') sql.write_db(f"update `{_t}` set body='{new_body}' where title='{title}'") @@ -75,5 +88,3 @@ def replace_from_sql(sql: BaseSQLAPI, table: str = None): continue else: logger.info('Replace all done.') - -