-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_download.py
26 lines (24 loc) · 965 Bytes
/
text_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import datetime
import psycopg2
import aiosql
import dcapi
# Download the text of every page of each document returned by the
# `get_doc_download_list` query and store it with the `add_page` query,
# committing once per document.

# db-related configuration
conn = psycopg2.connect("")
stmts = aiosql.from_path("sql/py.sql", "psycopg2")

try:
    docs = stmts.get_doc_download_list(conn)
    for cnt, doc_id, pg_cnt, print_prefix in docs:
        now = datetime.datetime.now().strftime('%m-%d %H:%M:%S')
        print(f'{cnt}, {now}, {doc_id}, {pg_cnt}')
        # Page files are numbered from 1: <print_prefix>1.txt, <print_prefix>2.txt, ...
        for pg in range(1, pg_cnt + 1):
            try:
                pg_text = dcapi.download_page_text(f'{print_prefix}{pg}.txt')
                # prevent string literal cannot contain NUL (0x00) characters issue
                pg_text = pg_text.replace('\x00', '')
                stmts.add_page(conn, id=doc_id, pg=pg,
                               word_cnt=len(pg_text.split()),
                               char_cnt=len(pg_text), body=pg_text)
            except psycopg2.Error as e:
                # A failed statement aborts the open transaction and psycopg2
                # rejects every further command until it is rolled back, so
                # none of this document's remaining inserts could succeed.
                # Roll back explicitly and move on to the next document
                # instead of downloading pages that cannot be stored.
                print(e)
                conn.rollback()
                print(f'{doc_id}: database error on page {pg}, document skipped')
                break
            except Exception as e:
                # Best effort: a page that cannot be downloaded or processed
                # is reported and skipped; the other pages are still stored.
                print(e)
        conn.commit()  # for performance only commit after every doc
finally:
    # Release the database connection even if the run is interrupted.
    conn.close()