This repository has been archived by the owner on Feb 14, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetting_info_by_isbn.py
204 lines (163 loc) · 6.04 KB
/
getting_info_by_isbn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
Look up information about books by ISBN using the Google Books API.
The results are written to output/Products.tsv.

How to run (example):
    python3 getting_info_by_isbn.py 978-1-119-70711-0 9781394186921 9781394182930 9781119909378

General form:
    <python> <path/to/script.py> <isbn1> <isbn2> ... <isbnN>

Example ISBN: "978-1-119-70711-0". An ISBN may have 10 or 13 digits and may be written with or without hyphens.
"""
import csv
import json
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from pprint import pprint
import requests
from text_to_num import alpha2digit
from logger import logger
# Script-wide log threshold.
# NOTE(review): `logger` comes from the project's own `logger` module, which is
# not visible here; presumably it behaves like a stdlib logger, so switching
# this to 'DEBUG' should surface the logger.debug() output used below - confirm.
logger.setLevel(level='INFO')
class Book:
    """Plain container for the metadata of a single book.

    Instances start empty. Attributes such as ``isbn``, ``authors``, ``title``
    and ``edition`` are assigned from outside and by the methods below.
    """

    # Small ordinals are mapped by hand; anything else goes through alpha2digit.
    _SMALL_ORDINALS = {'first': 1, 'second': 2, 'third': 3}

    def __init__(self):
        pass

    def print_info(self) -> None:
        """Pretty-print every attribute currently set on the instance."""
        print()
        pprint(self.__dict__, sort_dicts=False, underscore_numbers=True)
        print()

    def create_folder_name(self) -> str:
        """Build a folder name, store it in ``self.folder_name`` and return it.

        The name is ``<author>_<edition>e_<isbn>``, where ``<author>`` is the
        last space-separated word of the first entry in ``self.authors``. The
        edition part is left out when ``self.edition`` is ``None``.

        :return: suggested folder name
        """
        author = self.authors[0].split(' ')[-1]
        if self.edition is None:
            parts = (author, self.isbn)
        else:
            parts = (author, f'{self.edition}e', self.isbn)
        folder = '_'.join(parts)
        self.folder_name = folder
        logger.debug(folder)
        return folder

    def search_edition(self, description: str) -> None:
        """Guess the edition number from free text and store it in ``self.edition``.

        The word directly in front of "edition" is taken. "first", "second"
        and "third" are mapped directly; any other word is passed through
        ``alpha2digit`` and the digits at the start of the result are used.
        ``self.edition`` ends up as an ``int`` or, when nothing usable is
        found, ``None``.

        :param description: text to scan (matched case-insensitively)
        """
        match = re.search(r'([A-Za-z0-9_-]*)\s+edition', description, re.IGNORECASE)
        if match is None:
            self.edition = None
            logger.debug('edition is %s', self.edition)
            return

        word = match.group(1).lower()
        logger.debug('edition is %s', word)

        if word in self._SMALL_ORDINALS:
            self.edition = self._SMALL_ORDINALS[word]
            return

        converted = alpha2digit(word, 'en')
        # Only digits at the very start count (e.g. "4th"), so a word such as
        # "covid-19" is not mistaken for an edition number.
        leading_digits = re.match(r'[0-9]+', converted)
        self.edition = int(leading_digits.group()) if leading_digits else None
# ---------------------
def get_info_by_isbn(isbn: str):
    """Fetch the metadata of one book from the Google Books API.

    :param isbn: ISBN, inserted as-is into the ``isbn:`` search query
    :return: a populated ``Book``, or ``0`` when the API reports no items for
        this ISBN (callers filter the ``0`` entries out)
    :raises Exception: when the HTTP response is not successful
    :raises KeyError: when the item lacks ``authors``, ``title``, ``infoLink``
        or ``selfLink``; the remaining fields are optional and default to ``None``
    """
    import logging  # function scope: only the DEBUG level constant is needed

    url = 'https://www.googleapis.com/books/v1/volumes?q=isbn:' + isbn
    response = requests.get(url, timeout=15)
    if not response.ok:
        # Format the response instead of concatenating it: str + Response
        # raised TypeError and hid the real failure.
        msg = f"Answer wasn't gotten. Response: {response}"
        raise Exception(msg)

    json_all = response.json()
    if json_all['totalItems'] == 0:
        logger.warning('There is no information on server about book with ISBN %s. \nBook is skipped.', isbn)
        return 0
    if json_all['totalItems'] > 1:
        logger.warning(
            "Book with ISBN %s has several 'items'. Book is not skipped. \n"
            'First item is taken, because usually items are identical.\n',
            isbn,
        )

    data_all = json_all['items'][0]
    info = data_all['volumeInfo']
    # Dump the whole selected item for troubleshooting.
    logger.debug('\n-------\n%s\n-------\n', json.dumps(data_all, indent=2))

    # These parts are not present for every volume, so read them defensively.
    snippet = data_all.get('searchInfo', {}).get('textSnippet')
    categories = info.get('categories')
    page_count = info.get('pageCount', 0)

    book = Book()
    book.isbn = isbn  # or info['industryIdentifiers'][0 <or> 1]['identifier']
    book.authors = info['authors']
    book.title = info['title']
    book.subtitle = info.get('subtitle')
    book.publisher = info.get('publisher')

    # The edition is guessed from the description plus the search snippet.
    # "canadian" is stripped first - presumably so that a phrase like
    # "second canadian edition" still yields "second".
    description = (info.get('description', '') + ' ' + (snippet or '')).lower().replace('canadian', '')
    logger.debug(description)
    book.search_edition(description)

    book.category = categories[0] if categories else None
    book.published_date = info.get('publishedDate')
    # Fall back to printedPageCount when pageCount is missing or not positive.
    book.page_count = page_count if page_count > 0 else info.get('printedPageCount')
    book.text_snippet = snippet
    book.thumbnail = info.get('imageLinks', {}).get('thumbnail')
    book.info_link = info['infoLink']
    book.full_json = data_all['selfLink']
    book.rating = info.get('averageRating')
    book.ratings_count = info.get('ratingsCount')
    book.create_folder_name()

    # Numeric levels never equal the string 'DEBUG'; accept both spellings so
    # the dump also works with a stdlib-style logger.
    if logger.level in ('DEBUG', logging.DEBUG):
        book.print_info()
    return book
def create_tsv(results: list):
    """Write the collected books to ``output/Products.tsv`` (tab-separated).

    The last column counts how often the last six characters of each ISBN
    occur in ``Tickets_Sybex.tsv`` (read from the current directory). When
    that file does not exist, the column is left empty instead of failing.

    :param results: iterable of book objects; entries equal to ``0``
        (skipped look-ups) are dropped
    """
    books = [result for result in results if result != 0]

    # Sybex: load the tickets before touching the output file, so a problem
    # here cannot leave a truncated report behind.
    tickets_path = Path('Tickets_Sybex.tsv')
    if tickets_path.is_file():
        with open(tickets_path) as file_tickets:
            tickets = ' '.join(file_tickets)
    else:
        logger.warning('%s was not found. Ticket counts are left empty.', tickets_path)
        tickets = None

    output = Path('output')
    output.mkdir(exist_ok=True)
    path = output / 'Products.tsv'

    header = [
        'isbn',
        'published',
        'edition (supposed)',
        'pages',
        'publisher',
        'authors',
        'title',
        'subtitle',
        'category',
        'full JSON',
        'info link',
        '# of tickets',
    ]

    # newline='' is what the csv module expects from its file objects;
    # an explicit encoding keeps non-ASCII titles writable on every platform.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(header)
        for book in books:
            last6 = book.isbn[-6:]  # Sybex
            n = tickets.count(last6) if tickets is not None else ''  # Sybex
            writer.writerow(
                [
                    book.isbn,
                    book.published_date,
                    book.edition,
                    book.page_count,
                    book.publisher,
                    ', '.join(book.authors),
                    book.title,
                    book.subtitle,
                    book.category,
                    book.full_json,
                    book.info_link,
                    n,  # Sybex
                ]
            )
def main(args: list):
    """Look up every ISBN in ``args`` on a thread pool and write the TSV report.

    :param args: ISBN strings, one look-up is submitted per entry
    """
    logger.info('Work started')
    with ThreadPoolExecutor() as executor:
        # map() returns a lazy iterator; it is consumed by create_tsv.
        lookups = executor.map(get_info_by_isbn, args)
    create_tsv(lookups)
    logger.info('Work completed')
if __name__ == '__main__':
    # sys.argv[0] is the script itself; everything after it is an ISBN.
    cli_isbns = sys.argv[1:]
    if len(sys.argv) == 1:
        raise Exception('Not enough arguments. Pass ISBN(s) as arguments.')
    main(cli_isbns)