-
Notifications
You must be signed in to change notification settings - Fork 0
/
docusaurus_nb.py
102 lines (77 loc) · 3.29 KB
/
docusaurus_nb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# coding: utf-8
import glob
import os
import re
import shlex
import subprocess

import pypandoc

import docusaurus_config as config
# Short alias used throughout the script to build paths.
join = os.path.join

# Directory locations are taken from the docusaurus_config module.
ODT_path = config.ODT_path
DOCX_path = config.DOCX_path
PDF_path = config.PDF_path
MD_path = config.MD_path

# Create every configured directory up front; existing ones are left alone.
for _workdir in (ODT_path, DOCX_path, PDF_path, MD_path):
    os.makedirs(_workdir, exist_ok=True)
def filename(path):
    """Return the base name of *path* without its final extension.

    Only the last suffix of the final path component is removed, so dots in
    directory names (``./docs/a.odt``) or inside the file name itself
    (``report.v2.docx``) no longer truncate the result.

    Args:
        path: A file path as a string.

    Returns:
        The last path component with its trailing extension stripped.
    """
    # Take the base name first so dots in parent directories are irrelevant.
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem
# **Convert all odt to docx with pandoc**
# Every entry found in ODT_path is converted; the result keeps the same
# base name and is written to DOCX_path with a .docx extension.
for file in glob.glob(join(ODT_path, '*')):
    docx_name = filename(file) + '.docx'
    outfile = join(DOCX_path, docx_name)
    pypandoc.convert_file(file, 'docx', outputfile=outfile)
# **Convert all docx to md with mammoth**
def exclamation(command):
    """Run *command* through the shell and return the completed process.

    The command is executed with ``shell=True`` in text mode. Because
    ``check=True`` is set, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    run_options = {'check': True, 'text': True, 'shell': True}
    completed = subprocess.run(command, **run_options)
    return completed
# Run mammoth on every entry in DOCX_path, giving each document its own
# output folder under MD_path.
for file in glob.glob(join(DOCX_path, '*')):
    # Spaces in the document name become underscores in the folder name.
    out_dir = join(MD_path, filename(file).replace(' ', '_'))
    os.makedirs(out_dir, exist_ok=True)
    # The command string is executed by a shell, so both paths are quoted
    # with shlex.quote: hand-written '...' quoting breaks as soon as a
    # name contains an apostrophe or other shell metacharacters.
    exclamation(
        'mammoth --output-format=markdown {file} --output-dir={out_dir}'.format(
            file=shlex.quote(file), out_dir=shlex.quote(out_dir)
        )
    )
    print(out_dir)
# * **copy the html file from each folder**
# * **change the image paths**
# * **add page titles**
# * **write to a new file at parent folder**
def get_regex_pattern(string):
    """Choose the regex used to locate annotated titles in *string*.

    The first run of letters, digits or hyphens found in *string* is
    embedded in two candidate patterns. Each pattern captures the whole
    expression as group 1 and the number as group 2. The infix pattern is
    returned when it matches somewhere in *string*; otherwise the postfix
    pattern is returned without being checked.

    Raises:
        TypeError: if *string* contains no letter, digit or hyphen, because
            the initial search then yields ``None``.
    """
    # all patterns must match an entire expression and the number, in that order
    leading_token = re.search(r'([a-zA-Z0-9\-]+)', string)[0]

    # infix annotated titles: the number sits inside parentheses after a year
    infix = r'([\*_]{0,5}%s.*?\((?:[\*_]){0,5}\d{4}(?:\\?-[0-9]{4})?\D*?(?:[\*_]){0,5}\D*?:\D*?(?:[\*_]){0,4}\D*?([0-9]+(?:\s{0,2}\\?(?:-|y|,)\s{0,2}[0-9]+){0,3})\D*?\))' % leading_token
    if re.search(infix, string):
        return infix

    # postfix annotated titles: the number follows a "p." marker
    return r'([\*_]{0,5}%s(?:.*?\n{0,3}){0,3}[pP]\\?\s{0,3}\.\s{0,3}([0-9]+(?:\\?-[0-9]+){0,3}))' % leading_token
# for some reason when mammoth exports md to a directory
# it writes it to a html
# Post-process each exported file: fix image paths, normalise headings and
# bold markers, insert page titles, then write "<title>.md" next to the
# document's folder.
flags = re.IGNORECASE
for file in glob.glob(join(MD_path, '*', '*.html')):
    # The encoding is pinned so the result does not depend on the platform
    # default (assumes mammoth writes UTF-8 -- TODO confirm).
    with open(file, 'r', encoding='utf-8') as input_file:
        content = input_file.read()

    # Folder that holds this document (and the image paths being rewritten).
    # os.path is used instead of splitting on '/' so this also works with
    # other path separators.
    document_dir = os.path.dirname(file)
    parent_folder = os.path.basename(document_dir)

    # change the image paths: prefix every image target with the folder name.
    # A replacement function is used rather than a template string, because a
    # folder name starting with a digit would turn r'\1' + name into a
    # different (invalid) group reference, and backslashes in the name would
    # be read as escapes.
    content = re.sub(
        r'(!\[.*?\]\()(.*?)(\))',
        lambda match: '{}{}/{}{}'.format(
            match.group(1), parent_folder, match.group(2), match.group(3)
        ),
        content,
    )

    # remove mammoth given headers
    content = re.sub(r'<a.*?><\/a>## (.*)', r'### \1', content)

    # mitigate improperly bolded words
    # starting late: move the opening marker in front of the whole word
    content = re.sub(r' ([a-zA-Z])__([a-zA-Z]+)', r' __\1\2', content)
    # ending early: move the closing marker behind the whole word, keeping
    # the following space so the next word is not glued onto the marker
    content = re.sub(r'([a-zA-Z]+)__([a-zA-Z]) ', r'\1\2__ ', content)
    # the colons seem to be annoying the parser
    content = re.sub(r'__:', r':__', content)

    # removes double minus
    content = re.sub(r'(\d)\\-\\-(\d)', r'\1-\2', content)

    # add page titles: the captured number becomes a heading placed above
    # the matched expression
    pattern = get_regex_pattern(content)
    content = re.sub(pattern, r'\n## \2\n\1', content, flags=flags)

    # write to a new file at the parent folder. Underscores are replaced
    # only in the document folder's own name; replacing them across the
    # whole path could point at a directory that does not exist.
    title = parent_folder.replace('_', ' ')
    output_path = join(os.path.dirname(document_dir), title + '.md')
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write('# ' + title + '\n\n')
        output_file.write(content)