Skip to content

Commit a0187b7

Browse files
committed
clean up code
1 parent 1a2e954 commit a0187b7

File tree

2 files changed

+78
-73
lines changed

2 files changed

+78
-73
lines changed

endnote_convert.py

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,59 @@
1-
import sys
2-
import os.path
31
import re
4-
import string
52
from bs4 import BeautifulSoup
63

74
# Ask for the HTM(L) full-text file (i.e. a docx document converted to HTML).
input_file = input("HTM(L) file name (e.g. myText.htm):\n")
96

7+
108
def http_get(url):
    """Parse a local HTM(L) file and return it as a BeautifulSoup tree.

    NOTE(review): despite the name, this does no HTTP -- ``url`` is a local
    file path. The original implementation leaked the file handle; a
    context manager guarantees it is closed (BeautifulSoup consumes the
    stream fully during construction, so closing afterwards is safe).
    """
    with open(url, 'r') as fp:
        return BeautifulSoup(fp, 'html.parser')
11+
1312

1413
# this function does all the work
# find superscript numbers and convert them to endnote readable unformatted references
def replace_sup(file):
    """Convert superscript reference numbers in ``file`` (a BeautifulSoup
    tree) into EndNote-readable unformatted citations and return the tree.

    Handles three patterns inside <sup> spans:
      * a single ref:    "1" or "12"   -> "{#1}"
      * a comma list:    "1, 2, 34"    -> "{#1}{#2}{#34}"
      * a numeric range: "1-4"         -> "{#1}{#2}{#3}{#4}"

    Tags matching none of the patterns are left untouched.
    """
    # single refs like 1 or 12
    regex1 = r'yes">(\,\s){0,1}(\d+)</span>'
    # a group of refs like 1, 2, 34
    regex2 = r'yes">((\d+\,\s)+(\d+))</span>'
    # range of refs like 1-4
    regex3 = r'yes">((\d+)(\-)(\d+))</span>'
    for i in file.find_all('sup'):
        # stringify once instead of once per pattern
        text = str(i)
        match1 = re.search(regex1, text, re.MULTILINE)
        match2 = re.search(regex2, text, re.MULTILINE)
        match3 = re.search(regex3, text, re.MULTILINE)
        if match1:
            # group(2) is just the digits, so wrap them directly
            n_m = '{#' + match1.group(2) + '}'
        elif match2:
            # wrap every number, then drop the ", " separators
            n_m = re.sub(r'(\d+)', r'{#\1}', match2.group(1)).replace(', ', '')
        elif match3:
            first = int(match3.group(2))
            last = int(match3.group(4))
            # fill in the range of refs, because endnote likes to have {#1}{#2}{#3}{#4}
            # (an inverted range like "4-1" yields '', matching the old loop)
            n_m = ''.join('{#' + str(k) + '}' for k in range(first, last + 1))
        else:
            continue
        i.replace_with(n_m)
    return file
49+
5050

5151
# main code
if input_file == '':
    print('Please provide a file')
else:
    # parse the document, rewrite the superscript references, save a copy
    converted = replace_sup(http_get(input_file))
    out_name = input_file + "_modified.htm"
    with open(out_name, "wb") as f_output:
        f_output.write(converted.prettify("utf-8"))
    print('References exchanged. Ready to open the modified HTM file in Microsoft Word.')

scrape_refs.py

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
1-
import sys
2-
import os.path
31
import re
4-
import string
52
import requests
63
import random
74
from bs4 import BeautifulSoup
8-
from urllib.parse import quote
5+
96

107
# iterations for random search
118
iterations = 20
@@ -20,7 +17,8 @@
2017
# define some variables
2118
# pretend to be a firefox browser
2219
SESSION = requests.Session()
23-
SESSION.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
20+
SESSION.headers.update(
21+
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
2422

2523
URL_SEARCH = 'https://pubmed.ncbi.nlm.nih.gov/?sort=date&term={q}'
2624
URL_CITE = 'https://pubmed.ncbi.nlm.nih.gov/{ident}/?format=pubmed'
@@ -36,12 +34,14 @@ def http_get(url):
3634
return BeautifulSoup(r.text, features="html.parser")
3735

3836
# split reference into authors, year and title
37+
38+
3939
def split_reference(reference):
4040
r = reference
4141
left = ''
4242
right = ''
4343
year = ''
44-
#find (YEAR)
44+
# find (YEAR)
4545
regex = r'\(\d\d\d\d\)'
4646
match = re.search(regex, str(r))
4747
if match:
@@ -56,40 +56,42 @@ def split_reference(reference):
5656
return 'Error no YEAR found'
5757
# where is YEAR in reference? and how long is reference?
5858
pos_y = r.find(year)
59-
l = len(r)
59+
le = len(r)
6060
# if YEAR somewhere in the middle of reference, split into left-part YEAR right-part
61-
if (l - pos_y >= 15):
61+
if (le - pos_y >= 15):
6262
left, right = r[:pos_y], r[pos_y+6:]
6363
# else split on 'et al' into left-part 'et al' right-part
64-
elif ((l - pos_y <= 15) and (r.find('et al'))!= -1):
64+
elif ((le - pos_y <= 15) and (r.find('et al')) != -1):
6565
rs = r.split('et al')
6666
left = rs[0]
6767
right = rs[1]
6868
# else find the third full-stop from the end and split there into left-part full-stop right-part
69-
else:
69+
else:
7070
rs = r.strip('.').rsplit('.', 2)
7171
left = rs[0]
7272
right = rs[1]
7373

74-
# clean up
75-
right = right.replace(year,'')
76-
right = right.replace(' ',' ')
74+
# clean up
75+
right = right.replace(year, '')
76+
right = right.replace(' ', ' ')
7777
right = right.strip()
78-
left = left.replace(' ',' ')
78+
left = left.replace(' ', ' ')
7979
left = left.strip()
80-
year = year.replace('(','')
81-
year = year.replace(')','')
80+
year = year.replace('(', '')
81+
year = year.replace(')', '')
8282
split_ref = [left, year, right]
8383
return split_ref
8484

8585
# last fallback if no search quesries find a reference: use random word combinations from the title
86-
def choose_random(reference, l=3):
86+
87+
88+
def choose_random(reference, leng=3):
8789
q = reference.split()
8890
i = 1
8991
r = []
9092
while i < 6:
9193
rand = random.randint(1, len(q)-1)
92-
if len(q[rand]) > l:
94+
if len(q[rand]) > leng:
9395
if (q[rand] in r):
9496
continue
9597
else:
@@ -100,6 +102,8 @@ def choose_random(reference, l=3):
100102
return r
101103

102104
# search for Pubmed ID in results page
105+
106+
103107
def get_articles(query):
104108
url = URL_SEARCH.format(q=query)
105109
soup = http_get(url)
@@ -114,18 +118,20 @@ def get_articles(query):
114118
return pubmed
115119

116120
# use Pubmed ID to create URL and copy entry in NML format
121+
122+
117123
def get_citations(ident, resolve=True):
    """Fetch the PubMed entry for ``ident`` in PubMed text format.

    ``resolve`` is unused but kept for backward-compatible signature.
    Returns the stripped text of the last ``#article-details`` tag on the
    page, or '' when none is found.
    """
    url = URL_CITE.format(ident=ident)
    soup = http_get(url)
    citations = ''
    # find_all is the modern bs4 spelling of the deprecated findAll
    for tag in soup.find_all(id="article-details"):
        # tag.string is None when the tag has child elements; the old code
        # crashed on .strip() in that case -- skip such tags instead
        if tag.string is not None:
            citations = tag.string.strip()
    return citations
124130

125131

126132
# main code
127133
# open input and output fines
128-
if(ref_file ==''):
134+
if(ref_file == ''):
129135
print('Please provide a file')
130136
quit()
131137

@@ -149,8 +155,8 @@ def get_citations(ident, resolve=True):
149155
if (reference == ''):
150156
continue
151157

152-
print ("\n---------------------------------------------------------")
153-
print ("Doing reference:", reference)
158+
print("\n---------------------------------------------------------")
159+
print("Doing reference:", reference)
154160

155161
# split the reference into author, year, title
156162
query = split_reference(reference)
@@ -161,49 +167,48 @@ def get_citations(ident, resolve=True):
161167

162168
# find article by author and title
163169
r = get_articles(q[0] + ' ' + q[2])
164-
print ("Query: " + q[0] + ' ' + q[2])
170+
print("Query: " + q[0] + ' ' + q[2])
165171

166172
# find article by author and year
167173
if len(r) == 0:
168174
r = get_articles(q[0] + ' ' + q[1])
169-
print ("No results -- trying: " + q[0] + ' ' + q[1])
175+
print("No results -- trying: " + q[0] + ' ' + q[1])
170176

171177
# find article by year and title
172178
if len(r) == 0:
173179
r = get_articles(q[1] + ' ' + q[2])
174-
print ("Still no results -- trying: " + q[1] + ' ' + q[2])
180+
print("Still no results -- trying: " + q[1] + ' ' + q[2])
175181

176182
# find article by author year and title
177183
if len(r) == 0:
178184
r = get_articles(q[0] + ' ' + q[1] + ' ' + q[2])
179-
print ("Still no results -- trying: " + q[0] + ' ' + q[1] + ' ' + q[2])
185+
print("Still no results -- trying: " + q[0] + ' ' + q[1] + ' ' + q[2])
180186

181187
# find article by author year and random words from title
182188
if len(r) == 0:
183189
its = 0
184190
while its < iterations:
185191
q2 = choose_random(reference)
186-
print ("Still no results -- trying again with random words: ", q[0] + q[1] + q2)
192+
print("Still no results -- trying again with random words: ", q[0] + q[1] + q2)
187193
r = get_articles(q[0] + ' ' + q[1] + ' ' + q2)
188194
if len(r) != 0:
189195
break
190196
its += 1
191197

192198
if len(r) == 0:
193-
print ("Still no results -- skipping")
199+
print("Still no results -- skipping")
194200
skipped += 1
195201
skipped_refs = skipped_refs + '\n' + reference
196202
continue
197-
print ("Result written")
203+
print("Result written")
198204
myFile.write(get_citations(r) + '\n\n')
199205

200206
if skipped > 0:
201-
print ("\n---------------------------------------------------------")
207+
print("\n---------------------------------------------------------")
202208
print("Total number of results skipped: ", skipped)
203209
print("Please check the following references:\n", skipped_refs)
204210
else:
205-
print ("\n---------------------------------------------------------")
211+
print("\n---------------------------------------------------------")
206212
print('Done')
207213

208214
myFile.close()
209-
ref.close()

0 commit comments

Comments
 (0)