1
- import sys
2
- import os .path
3
1
import re
4
- import string
5
2
import requests
6
3
import random
7
4
from bs4 import BeautifulSoup
8
- from urllib . parse import quote
5
+
9
6
10
7
# iterations for random search
11
8
iterations = 20
20
17
# define some variables
21
18
# pretend to be a desktop Chrome browser (see the User-Agent string below)
22
19
SESSION = requests .Session ()
23
- SESSION .headers .update ({'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' })
20
+ SESSION .headers .update (
21
+ {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' })
24
22
25
23
URL_SEARCH = 'https://pubmed.ncbi.nlm.nih.gov/?sort=date&term={q}'
26
24
URL_CITE = 'https://pubmed.ncbi.nlm.nih.gov/{ident}/?format=pubmed'
@@ -36,12 +34,14 @@ def http_get(url):
36
34
return BeautifulSoup (r .text , features = "html.parser" )
37
35
38
36
# split reference into authors, year and title
37
+
38
+
39
39
def split_reference (reference ):
40
40
r = reference
41
41
left = ''
42
42
right = ''
43
43
year = ''
44
- #find (YEAR)
44
+ # find (YEAR)
45
45
regex = r'\(\d\d\d\d\)'
46
46
match = re .search (regex , str (r ))
47
47
if match :
@@ -56,40 +56,42 @@ def split_reference(reference):
56
56
return 'Error no YEAR found'
57
57
# where is YEAR in reference? and how long is reference?
58
58
pos_y = r .find (year )
59
- l = len (r )
59
+ le = len (r )
60
60
# if YEAR somewhere in the middle of reference, split into left-part YEAR right-part
61
- if (l - pos_y >= 15 ):
61
+ if (le - pos_y >= 15 ):
62
62
left , right = r [:pos_y ], r [pos_y + 6 :]
63
63
# else split on 'et al' into left-part 'et al' right-part
64
- elif ((l - pos_y <= 15 ) and (r .find ('et al' ))!= - 1 ):
64
+ elif ((le - pos_y <= 15 ) and (r .find ('et al' )) != - 1 ):
65
65
rs = r .split ('et al' )
66
66
left = rs [0 ]
67
67
right = rs [1 ]
68
68
# else find the third full-stop from the end and split there into left-part full-stop right-part
69
- else :
69
+ else :
70
70
rs = r .strip ('.' ).rsplit ('.' , 2 )
71
71
left = rs [0 ]
72
72
right = rs [1 ]
73
73
74
- # clean up
75
- right = right .replace (year ,'' )
76
- right = right .replace (' ' ,' ' )
74
+ # clean up
75
+ right = right .replace (year , '' )
76
+ right = right .replace (' ' , ' ' )
77
77
right = right .strip ()
78
- left = left .replace (' ' ,' ' )
78
+ left = left .replace (' ' , ' ' )
79
79
left = left .strip ()
80
- year = year .replace ('(' ,'' )
81
- year = year .replace (')' ,'' )
80
+ year = year .replace ('(' , '' )
81
+ year = year .replace (')' , '' )
82
82
split_ref = [left , year , right ]
83
83
return split_ref
84
84
85
85
# last fallback if no search queries find a reference: use random word combinations from the title
86
- def choose_random (reference , l = 3 ):
86
+
87
+
88
+ def choose_random (reference , leng = 3 ):
87
89
q = reference .split ()
88
90
i = 1
89
91
r = []
90
92
while i < 6 :
91
93
rand = random .randint (1 , len (q )- 1 )
92
- if len (q [rand ]) > l :
94
+ if len (q [rand ]) > leng :
93
95
if (q [rand ] in r ):
94
96
continue
95
97
else :
@@ -100,6 +102,8 @@ def choose_random(reference, l=3):
100
102
return r
101
103
102
104
# search for Pubmed ID in results page
105
+
106
+
103
107
def get_articles (query ):
104
108
url = URL_SEARCH .format (q = query )
105
109
soup = http_get (url )
@@ -114,18 +118,20 @@ def get_articles(query):
114
118
return pubmed
115
119
116
120
# use Pubmed ID to create URL and copy entry in PubMed (NLM) format
121
+
122
+
117
123
def get_citations(ident, resolve=True):
    """Fetch the citation text for one PubMed entry.

    Builds the entry URL from URL_CITE, downloads it with http_get and
    returns the stripped text of the element whose id is "article-details".

    ident   -- value substituted for {ident} in URL_CITE
    resolve -- not used by this function; kept so existing callers that
               pass it keep working

    Returns the citation text, or '' when the page contains no
    "article-details" element.
    """
    url = URL_CITE.format(ident=ident)
    soup = http_get(url)
    citations = ''
    # If several elements match, the last one wins (same as before).
    for tag in soup.find_all(id="article-details"):
        text = tag.string
        if text is None:
            # .string is None when the element has more than one child;
            # fall back to the concatenated text instead of crashing
            # on None.strip().
            text = tag.get_text()
        citations = text.strip()
    return citations
124
130
125
131
126
132
# main code
127
133
# open input and output files
128
- if (ref_file == '' ):
134
+ if (ref_file == '' ):
129
135
print ('Please provide a file' )
130
136
quit ()
131
137
@@ -149,8 +155,8 @@ def get_citations(ident, resolve=True):
149
155
if (reference == '' ):
150
156
continue
151
157
152
- print ("\n ---------------------------------------------------------" )
153
- print ("Doing reference:" , reference )
158
+ print ("\n ---------------------------------------------------------" )
159
+ print ("Doing reference:" , reference )
154
160
155
161
# split the reference into author, year, title
156
162
query = split_reference (reference )
@@ -161,49 +167,48 @@ def get_citations(ident, resolve=True):
161
167
162
168
# find article by author and title
163
169
r = get_articles (q [0 ] + ' ' + q [2 ])
164
- print ("Query: " + q [0 ] + ' ' + q [2 ])
170
+ print ("Query: " + q [0 ] + ' ' + q [2 ])
165
171
166
172
# find article by author and year
167
173
if len (r ) == 0 :
168
174
r = get_articles (q [0 ] + ' ' + q [1 ])
169
- print ("No results -- trying: " + q [0 ] + ' ' + q [1 ])
175
+ print ("No results -- trying: " + q [0 ] + ' ' + q [1 ])
170
176
171
177
# find article by year and title
172
178
if len (r ) == 0 :
173
179
r = get_articles (q [1 ] + ' ' + q [2 ])
174
- print ("Still no results -- trying: " + q [1 ] + ' ' + q [2 ])
180
+ print ("Still no results -- trying: " + q [1 ] + ' ' + q [2 ])
175
181
176
182
# find article by author year and title
177
183
if len (r ) == 0 :
178
184
r = get_articles (q [0 ] + ' ' + q [1 ] + ' ' + q [2 ])
179
- print ("Still no results -- trying: " + q [0 ] + ' ' + q [1 ] + ' ' + q [2 ])
185
+ print ("Still no results -- trying: " + q [0 ] + ' ' + q [1 ] + ' ' + q [2 ])
180
186
181
187
# find article by author year and random words from title
182
188
if len (r ) == 0 :
183
189
its = 0
184
190
while its < iterations :
185
191
q2 = choose_random (reference )
186
- print ("Still no results -- trying again with random words: " , q [0 ] + q [1 ] + q2 )
192
+ print ("Still no results -- trying again with random words: " , q [0 ] + q [1 ] + q2 )
187
193
r = get_articles (q [0 ] + ' ' + q [1 ] + ' ' + q2 )
188
194
if len (r ) != 0 :
189
195
break
190
196
its += 1
191
197
192
198
if len (r ) == 0 :
193
- print ("Still no results -- skipping" )
199
+ print ("Still no results -- skipping" )
194
200
skipped += 1
195
201
skipped_refs = skipped_refs + '\n ' + reference
196
202
continue
197
- print ("Result written" )
203
+ print ("Result written" )
198
204
myFile .write (get_citations (r ) + '\n \n ' )
199
205
200
206
if skipped > 0 :
201
- print ("\n ---------------------------------------------------------" )
207
+ print ("\n ---------------------------------------------------------" )
202
208
print ("Total number of results skipped: " , skipped )
203
209
print ("Please check the following references:\n " , skipped_refs )
204
210
else :
205
- print ("\n ---------------------------------------------------------" )
211
+ print ("\n ---------------------------------------------------------" )
206
212
print ('Done' )
207
213
208
214
myFile .close ()
209
- ref .close ()
0 commit comments