"""A tool for compiling the proceedings of the Chicago Linguistic Society
<http://chicagolinguisticsociety.org/>.
Download, documentation etc: <https://github.com/jacksonllee/cls-proceedings>
Author: Jackson Lee <[email protected]>
Last updated on 2017-09-18
"""
from __future__ import print_function
import sys
import argparse
import os
import csv
import subprocess
import platform
import logging
from PyPDF2 import (PdfFileWriter, PdfFileReader)
# --------------------------------------------------------------------------- #
# set up logging
MASTER_LOG_NAME = "master.log"
MASTER_LOGGER = logging.getLogger(__name__)
MASTER_LOGGER.setLevel(logging.INFO)
file_handler = logging.FileHandler(MASTER_LOG_NAME)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s: %(message)s')
file_handler.setFormatter(formatter)
MASTER_LOGGER.addHandler(file_handler)
# --------------------------------------------------------------------------- #
# some handy functions
def read_csv(fname, delim=',;\t| '):
"""Read a CSV file (n rows, m columns) with filename *fname*,
and return a list containing n lists each containing m strings.
"""
with open(fname, 'rU') as csvfile:
dialect = csv.Sniffer().sniff(csvfile.read(), delimiters=delim)
csvfile.seek(0)
data = csv.reader(csvfile, dialect)
return list(data)
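# For the organizer CSV used below, read_csv returns the header row followed
# by one row per paper, e.g. (the values here are purely illustrative):
#     [['index', 'authors', 'paper title', ...],
#      ['1', 'Jane Doe', 'Some paper title', ...], ...]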
def ensure_empty_dir(abs_dir_path):
"""Ensure that *abs_dir_path* is an empty folder."""
if os.path.isdir(abs_dir_path):
# delete everything inside the folder
existing_filenames = os.listdir(abs_dir_path)
for filename_ in existing_filenames:
filename_abs_path = os.path.join(abs_dir_path, filename_)
os.remove(filename_abs_path)
else:
# create the empty folder
os.makedirs(abs_dir_path)
# --------------------------------------------------------------------------- #
# set up this script's information
__author__ = 'Jackson Lee'
__author_email__ = '[email protected]'
__url__ = 'https://github.com/jacksonllee/cls-proceedings'
__longdescr__ = """\
Compiling the proceedings of the Chicago Linguistic Society\n
Download, documentation etc: <{}>\n
Author: {} <{}>""".format(__url__, __author__, __author_email__)
# --------------------------------------------------------------------------- #
# parse command line arguments
parser = argparse.ArgumentParser(
    description=__longdescr__,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--directory', type=str, default='example',
                    help='working directory, where all necessary files and'
                         ' subdirectories are')
parser.add_argument('--frontmatter', type=str, default='front-matter',
                    help='directory name for front matter')
parser.add_argument('--acknowledgments', type=str, default='acknowledgments',
                    help='directory name for acknowledgments')
parser.add_argument('--papers', type=str, default='papers-without-headers',
                    help='directory name for papers without headers')
parser.add_argument('--templates', type=str, default='templates',
                    help='directory name for templates (for headers etc)')
parser.add_argument('--toc', type=str, default='table-of-contents',
                    help='directory name for table of contents')
parser.add_argument('--headers', type=str, default='headers',
                    help='directory name for headers of papers')
parser.add_argument('--papersfinal', type=str, default='papers-with-headers',
                    help='directory name for papers with headers')
parser.add_argument('--organizer', type=str, default='organizer.csv',
                    help='filename of the organizer CSV file')
parser.add_argument('--maxheaderlength', type=int, default=55,
                    help='maximum number of characters in a header')
parser.add_argument('--output', type=str, default='proceedings.pdf',
                    help='filename of the final proceedings pdf output '
                         '(size 8.5" x 11")')
parser.add_argument('--output6by9', type=str, default='proceedings6by9.pdf',
                    help='filename of the final proceedings pdf output '
                         '(size 6" x 9")')
parser.add_argument('--scale', type=float, default=0.95,
                    help='Scale factor for the 6" x 9" output based on the '
                         '8.5" x 11". Try a slightly smaller value if '
                         'material is still cut off at the margins.')
parser.add_argument('--startpagenumber', type=int, default=1,
                    help='the starting page number of the first paper by '
                         'order in the volume')
command_line_args = parser.parse_args()
front_matter_dir = command_line_args.frontmatter
acknowledgments_dir = command_line_args.acknowledgments
papers_dir = command_line_args.papers
templates_dir = command_line_args.templates
toc_dir = command_line_args.toc
headers_dir = command_line_args.headers
papersfinal_dir = command_line_args.papersfinal
organizer_name = command_line_args.organizer
max_header_length = command_line_args.maxheaderlength
proceedings_pdf_filename = command_line_args.output
proceedings_pdf_6by9_filename = command_line_args.output6by9
scale_factor = command_line_args.scale
start_page_number = command_line_args.startpagenumber
working_dir = os.path.abspath(command_line_args.directory)
if not os.path.isdir(working_dir):
    raise RuntimeError('The directory {} does not exist.'.format(working_dir))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Defining log filenames and file objects')
PDFLATEX_LOG_NAME = 'pdflatex.log'
DIRECTORY_LOG_NAME = 'directory.log'
PDFLATEX_LOG = open(PDFLATEX_LOG_NAME, 'w')
DIRECTORY_LOG = open(DIRECTORY_LOG_NAME, 'w')
def print_directory_log(msg):
    print(msg, file=DIRECTORY_LOG)
    for dir_, _, filenames in os.walk(working_dir):
        for filename in filenames:
            print(os.path.join(dir_, filename), file=DIRECTORY_LOG)
    print(file=DIRECTORY_LOG)
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Logging directory contents before PDF manipulation')
print_directory_log('Before any PDF manipulation:')
# --------------------------------------------------------------------------- #
# print basic info/metadata to master log
MASTER_LOGGER.info('System: {}'.format(platform.system()))
MASTER_LOGGER.info('Node: {}'.format(platform.node()))
MASTER_LOGGER.info('Release: {}'.format(platform.release()))
MASTER_LOGGER.info('Version: {}'.format(platform.version()))
MASTER_LOGGER.info('Machine: {}'.format(platform.machine()))
MASTER_LOGGER.info('Processor: {}'.format(platform.processor()))
MASTER_LOGGER.info('Python version: {}'.format(platform.python_version()))
MASTER_LOGGER.info('sys.argv: {}'.format(sys.argv))
MASTER_LOGGER.info('Your working directory: {}'.format(working_dir))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Reading the organizer CSV file')
organizer_path = os.path.join(working_dir, organizer_name)
if not os.path.isfile(organizer_path):
    raise RuntimeError('The organizer "{}" is not found.'
                       .format(organizer_path))
organizer = read_csv(organizer_path)
number_of_papers = len(organizer) - 1
organizer_headers = [x.lower().strip() for x in organizer[0]]
expected_organizer_headers = ['index', 'authors', 'paper title',
                              'authors in header', 'paper title in header',
                              'paper filename']
header_to_index = dict()
# key: a header in the organizer CSV (e.g., 'index', 'authors', 'paper title')
# value: the index of that header in the list expected_organizer_headers
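# For example, if the CSV columns happen to appear in the expected order,
# header_to_index would be {'index': 0, 'authors': 1, 'paper title': 2,
# 'authors in header': 3, 'paper title in header': 4, 'paper filename': 5}.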
for expected_header in expected_organizer_headers:
    try:
        header_to_index[expected_header] = \
            organizer_headers.index(expected_header)
    except ValueError:
        raise RuntimeError("The header {} is not found in the organizer CSV "
                           "file.".format(expected_header))
authors_list = [row[header_to_index['authors']] for row in organizer[1:]]
paper_title_list = [row[header_to_index['paper title']]
                    for row in organizer[1:]]
authors_in_header_list = [row[header_to_index['authors in header']]
                          for row in organizer[1:]]
paper_title_in_header_list = [row[header_to_index['paper title in header']]
                              for row in organizer[1:]]
paper_filename_list = [row[header_to_index['paper filename']]
                       for row in organizer[1:]]
MASTER_LOGGER.info('The organizer is: {}'.format(organizer_path))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Checking if any author or paper title headers are too '
                   'long')
error_template = 'The header "{}" for paper {} is longer than {} characters.'
for i in range(number_of_papers):
    authors_in_header = authors_in_header_list[i]
    paper_title_in_header = paper_title_in_header_list[i]
    # if no header specified, use the full author/title name
    if not authors_in_header:
        authors_in_header = authors_list[i]
        authors_in_header_list[i] = authors_in_header
    if not paper_title_in_header:
        paper_title_in_header = paper_title_list[i]
        paper_title_in_header_list[i] = paper_title_in_header
    MASTER_LOGGER.info(authors_in_header)
    MASTER_LOGGER.info(paper_title_in_header)
    # max_header_length can be set at command line arguments
    if len(authors_in_header) > max_header_length:
        raise RuntimeError(error_template.format(authors_in_header, i + 1,
                                                 max_header_length))
    if len(paper_title_in_header) > max_header_length:
        raise RuntimeError(error_template.format(paper_title_in_header, i + 1,
                                                 max_header_length))
# --------------------------------------------------------------------------- #
# For the front matter and the acknowledgments, I intentionally use lists here
# (front_matter_filenames and acknowledgments_filenames) to allow for the
# possibility that more than one PDF, rather than exactly one, may be
# permitted for whatever reason. In that case, care should be taken as to
# where the blank pages are inserted (cf. towards the end of the code below).
MASTER_LOGGER.info('Checking if the front matter file is present')
front_matter_abs_dir = os.path.join(working_dir, front_matter_dir)
if not os.path.isdir(front_matter_abs_dir):
raise RuntimeError("The front matter directory {} does not exist."
.format(front_matter_abs_dir))
front_matter_filenames = sorted([x for x in os.listdir(front_matter_abs_dir)
                                 if x.lower().endswith('.pdf')])
if len(front_matter_filenames) == 0:
    raise RuntimeError('No front matter pdf is found at {}'
                       .format(front_matter_abs_dir))
if len(front_matter_filenames) > 1:
    raise RuntimeError('More than 1 pdf file is found at {}\n'
                       '(Only 1 pdf is allowed.)'
                       .format(front_matter_abs_dir))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Checking if acknowledgments pdf is present')
acknowledgments_abs_dir = os.path.join(working_dir, acknowledgments_dir)
if not os.path.isdir(acknowledgments_abs_dir):
    raise RuntimeError('The acknowledgments directory {} does not exist.'
                       .format(acknowledgments_abs_dir))
acknowledgments_filenames = sorted([x for x in
                                    os.listdir(acknowledgments_abs_dir)
                                    if x.lower().endswith('.pdf')])
if len(acknowledgments_filenames) == 0:
    raise RuntimeError('No acknowledgments pdf is found at {}'
                       .format(acknowledgments_abs_dir))
if len(acknowledgments_filenames) > 1:
    raise RuntimeError('More than 1 pdf file is found at {}\n'
                       '(Only 1 pdf is allowed.)'
                       .format(acknowledgments_abs_dir))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Checking if templates files are present')
templates_abs_dir = os.path.join(working_dir, templates_dir)
if not os.path.isdir(templates_abs_dir):
    raise RuntimeError('The templates directory {} does not exist.'
                       .format(templates_abs_dir))
headers_template_path = os.path.join(templates_abs_dir, 'headers.tex')
blank_page_path = os.path.join(templates_abs_dir, 'blank.pdf')
toc_template_path = os.path.join(templates_abs_dir, 'table-of-contents.tex')
if not os.path.isfile(headers_template_path):
    raise RuntimeError('Expected template file not found -- {}'
                       .format(headers_template_path))
if not os.path.isfile(blank_page_path):
    raise RuntimeError('Expected template file not found -- {}'
                       .format(blank_page_path))
if not os.path.isfile(toc_template_path):
    raise RuntimeError('Expected template file not found -- {}'
                       .format(toc_template_path))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Checking if all pdf papers are present, '
                   'and getting number of pages for each paper')
number_of_pages_list = list() # list of int
page_range_list = list() # list of (int, int)
cumulative_start_page = start_page_number
current_paper_start_page = start_page_number
for paper_filename in paper_filename_list:
    paper_path = os.path.join(working_dir, papers_dir, paper_filename)
    if not os.path.isfile(paper_path):
        raise RuntimeError('The file "{}" is not found in {}.\nCheck if '
                           'actual filenames match those in the CSV organizer.'
                           .format(paper_filename,
                                   os.path.join(working_dir, papers_dir)))
    pdf_object = PdfFileReader(open(paper_path, 'rb'))
    number_of_pages = pdf_object.getNumPages()
    end_page = current_paper_start_page + number_of_pages - 1
    number_of_pages_list.append(number_of_pages)
    page_range_list.append((current_paper_start_page, end_page))
    cumulative_start_page += number_of_pages
    if cumulative_start_page % 2:  # if cumulative_start_page is an odd number
        pass
        # The current paper ends on the *left*-hand side in the printed volume.
        # The next paper will start immediately afterwards (no blank page
        # insertion needed), on the right-hand side in the printed volume.
    else:
        cumulative_start_page += 1
        # The paper ends on the *right*-hand side in the printed volume.
        # We will need to insert a blank page right after this page,
        # so that the next paper starts on the right-hand side.
        # In terms of page number tracking, the next paper will skip one page
        # (= the blank page), and therefore we need to increment
        # cumulative_start_page by 1.
    current_paper_start_page = cumulative_start_page
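# Worked example of the bookkeeping above: with start_page_number = 1, a
# first paper of 12 pages spans pages 1-12 and ends on a left-hand (even)
# page, so the second paper starts at page 13 with no blank page needed;
# an 11-page first paper would span pages 1-11 and end on a right-hand (odd)
# page, so a blank page fills page 12 and the second paper starts at page 13.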
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info("Creating headers' latex files and "
"generating the headers' pdfs")
headers_abs_dir = os.path.join(working_dir, headers_dir)
ensure_empty_dir(headers_abs_dir)
header_template_str = open(headers_template_path).read()
for i in range(number_of_papers):
    # noinspection PyRedeclaration
    latex_str = header_template_str
    authors_in_header = authors_in_header_list[i].upper()  # all uppercase
    paper_title_in_header = paper_title_in_header_list[i].upper()  # uppercase
    number_of_pages = number_of_pages_list[i]
    current_paper_start_page, end_page = page_range_list[i]
    page_range_str = '{}-{}'.format(current_paper_start_page, end_page)
    insert_pages_str = '\\newpage\n\n\\mbox{}\n' * (number_of_pages - 1)
    headers_latex_filename = 'headers{}.tex'.format(i)
    latex_str = latex_str.replace('XXStartPageXX',
                                  str(current_paper_start_page))
    latex_str = latex_str.replace('XXAuthorsXX', authors_in_header)
    latex_str = latex_str.replace('XXTitleXX', paper_title_in_header)
    latex_str = latex_str.replace('XXPageRangeXX', page_range_str)
    latex_str = latex_str.replace('XXInsertPagesXX', insert_pages_str)
    output_latex_path = os.path.join(headers_abs_dir, headers_latex_filename)
    with open(output_latex_path, 'w') as f:
        f.write(latex_str)
    subprocess.call(('pdflatex', '-output-directory', headers_abs_dir,
                     output_latex_path), stdout=PDFLATEX_LOG)
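# Each pdflatex call above should leave a file headers{i}.pdf in
# headers_abs_dir, with one page of running header and page number for every
# page of paper i; the next step overlays those pages onto the paper itself.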
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Creating paper pdfs with headers')
papersfinal_abs_dir = os.path.join(working_dir, papersfinal_dir)
ensure_empty_dir(papersfinal_abs_dir)
for i, paper_filename in enumerate(paper_filename_list):
    paper_filename_abs_path = os.path.join(working_dir, papers_dir,
                                           paper_filename)
    headers_abs_path = os.path.join(headers_abs_dir, 'headers{}.pdf'.format(i))
    output_pdf = PdfFileWriter()
    paper_pdf = PdfFileReader(open(paper_filename_abs_path, 'rb'))
    headers_pdf = PdfFileReader(open(headers_abs_path, 'rb'))
    number_of_pages = number_of_pages_list[i]
    # align the paper pdf and the headers pdf and merge (="superimpose") them
    for j in range(number_of_pages):
        page = paper_pdf.getPage(j)
        page.mergePage(headers_pdf.getPage(j))
        output_pdf.addPage(page)
    output_pdf_abs_path = os.path.join(papersfinal_abs_dir, paper_filename)
    output_pdf.write(open(output_pdf_abs_path, 'wb'))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Creating the table of contents')
toc_abs_dir = os.path.join(working_dir, toc_dir)
ensure_empty_dir(toc_abs_dir)
toc_template = open(toc_template_path).read()
toc_entry_template = '\\TocEntry{{{}}}{{{}}}{{{}}}\n\n'
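# Each entry renders as, e.g., \TocEntry{Jane Doe}{Some paper title}{1}
# (the author and title here are purely illustrative).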
toc_entries_str = ''
for i in range(number_of_papers):
    authors = authors_list[i]
    paper_title = paper_title_list[i]
    current_paper_start_page = page_range_list[i][0]
    toc_entries_str += toc_entry_template.format(authors, paper_title,
                                                 current_paper_start_page)
toc_template = toc_template.replace('XXInsertTocEntriesXX', toc_entries_str)
output_toc_tex_path = os.path.join(toc_abs_dir, 'table-of-contents.tex')
with open(output_toc_tex_path, 'w') as f:
    f.write(toc_template)
subprocess.call(('pdflatex', '-output-directory', toc_abs_dir,
                 output_toc_tex_path), stdout=PDFLATEX_LOG)
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info("""Working directory: {}
Creating the final proceedings pdf output
Input pdf files are concatenated in the following order.
(Blank pages are automatically added if necessary.)""".format(working_dir))
proceedings_pdf = PdfFileWriter()
proceedings_pdf_abs_path = os.path.join(working_dir, proceedings_pdf_filename)
cumulative_page_count = 0
blank_page_pdf = PdfFileReader(open(blank_page_path, 'rb'))
def add_files(category, filenames_, input_abs_dir):
"""
Handle pdf files for *category* (str).
Input pdf files are in *input_abs_dir* (str)
*filenames* gives the list of filenames relative to *input_abs_dir*.
"""
global proceedings_pdf
global cumulative_page_count
global blank_page_pdf
MASTER_LOGGER.info('(For {})'.format(category))
for filename_ in filenames_:
input_pdf_path = os.path.join(input_abs_dir, filename_)
MASTER_LOGGER.info('\t' + os.path.relpath(input_pdf_path, working_dir))
input_pdf = PdfFileReader(open(input_pdf_path, 'rb'))
input_number_of_pages = input_pdf.getNumPages()
proceedings_pdf.appendPagesFromReader(input_pdf)
cumulative_page_count += input_number_of_pages
# check if blank page insertion is needed
if cumulative_page_count % 2: # if odd number
cumulative_page_count += 1
proceedings_pdf.appendPagesFromReader(blank_page_pdf)
add_files('front matter', front_matter_filenames, front_matter_abs_dir)
add_files('acknowledgments', acknowledgments_filenames,
          acknowledgments_abs_dir)
add_files('table of contents', ['table-of-contents.pdf'], toc_abs_dir)
add_files('papers', paper_filename_list, papersfinal_abs_dir)
proceedings_pdf.write(open(proceedings_pdf_abs_path, 'wb'))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Creating the 6" x 9" final proceedings PDF')
proceedings_pdf_85by11 = PdfFileReader(open(proceedings_pdf_abs_path, 'rb'))
page1 = proceedings_pdf_85by11.getPage(0)
original_page_width, original_page_height = page1.mediaBox.getUpperRight()
new_page_width = original_page_width * scale_factor
new_page_height = original_page_height * scale_factor
page_width_delta = (original_page_width - new_page_width) / 2
page_height_delta = (original_page_height - new_page_height) / 2
trim_from_left_right_margins = (original_page_width * (1.25 / 8.5) -
                                page_width_delta)
trim_from_top_margin = original_page_height * (0.67 / 11) - page_height_delta
trim_from_bottom_margin = (original_page_height * (1.33 / 11) -
                           page_height_delta)
new_lower_left = (trim_from_left_right_margins, trim_from_bottom_margin)
new_upper_right = (new_page_width - trim_from_left_right_margins,
                   new_page_height - trim_from_top_margin)
proceedings_pdf_6by9 = PdfFileWriter()
proceedings_pdf_6by9_abs_path = os.path.join(working_dir,
                                             proceedings_pdf_6by9_filename)
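# Sketch of the arithmetic above, assuming an 8.5" x 11" input (612 x 792 pt)
# and the default scale factor of 0.95: the scaled page is 581.4 x 752.4 pt,
# the trims come to 74.7 pt on the left/right, 28.44 pt on top and 75.96 pt
# on the bottom, and the resulting crop box is 432 x 648 pt, i.e. 6" x 9".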
for i in range(proceedings_pdf_85by11.getNumPages()):
    page = proceedings_pdf_85by11.getPage(i)
    page.scaleBy(scale_factor)
    page.mediaBox.lowerLeft = new_lower_left
    page.mediaBox.upperRight = new_upper_right
    proceedings_pdf_6by9.addPage(page)
proceedings_pdf_6by9.write(open(proceedings_pdf_6by9_abs_path, 'wb'))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('All done!! Please find these in the working directory:\n'
                   '\tSize 8.5" x 11": {}\n'
                   '\tSize 6" x 9": {}\n'
                   .format(os.path.basename(proceedings_pdf_abs_path),
                           os.path.basename(proceedings_pdf_6by9_abs_path)))
MASTER_LOGGER.info("Other output files are in the following subfolders:\n"
                   "{}\n{}\n{}"
                   .format(os.path.relpath(papersfinal_abs_dir, working_dir),
                           os.path.relpath(toc_abs_dir, working_dir),
                           os.path.relpath(headers_abs_dir, working_dir)))
MASTER_LOGGER.info('Log files:')
MASTER_LOGGER.info('{} (this current file)'.format(MASTER_LOG_NAME))
MASTER_LOGGER.info('{} (log by the "pdflatex" command)'
                   .format(PDFLATEX_LOG_NAME))
MASTER_LOGGER.info('{} (all contents at the working directory)'
                   .format(DIRECTORY_LOG_NAME))
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Printing the directory tree after final PDF compilation')
print_directory_log('After completion of final PDF compilation:')
# --------------------------------------------------------------------------- #
MASTER_LOGGER.info('Closing log files for pdflatex and directory tree')
PDFLATEX_LOG.close()
DIRECTORY_LOG.close()