Skip to content

Commit a0187b7

Browse files
committed
clean up code
1 parent 1a2e954 commit a0187b7

File tree

2 files changed

+78
-73
lines changed

2 files changed

+78
-73
lines changed

endnote_convert.py

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,59 @@
1-
import sys
2-
import os.path
31
import re
4-
import string
52
from bs4 import BeautifulSoup
63

74
# Ask for the HTM(L) full-text file (i.e. a docx document converted to HTML).
input_file = input("HTM(L) file name (e.g. myText.htm):\n")
96

7+
108
def http_get(url):
    """Parse a local HTM(L) file and return it as a BeautifulSoup tree.

    NOTE(review): despite the name, this does no HTTP -- ``url`` is a local
    file path. The original implementation leaked the file handle; a
    context manager guarantees it is closed (BeautifulSoup consumes the
    stream fully during construction, so closing afterwards is safe).
    """
    with open(url, 'r') as fp:
        return BeautifulSoup(fp, 'html.parser')
11+
1312

1413
# this function does all the work
# find superscript numbers and convert them to endnote readable unformatted references
def replace_sup(file):
    """Convert superscript reference numbers in ``file`` (a BeautifulSoup
    tree) into EndNote-readable unformatted citations and return the tree.

    Handles three patterns inside <sup> spans:
      * a single ref:    "1" or "12"   -> "{#1}"
      * a comma list:    "1, 2, 34"    -> "{#1}{#2}{#34}"
      * a numeric range: "1-4"         -> "{#1}{#2}{#3}{#4}"

    Tags matching none of the patterns are left untouched.
    """
    # single refs like 1 or 12
    regex1 = r'yes">(\,\s){0,1}(\d+)</span>'
    # a group of refs like 1, 2, 34
    regex2 = r'yes">((\d+\,\s)+(\d+))</span>'
    # range of refs like 1-4
    regex3 = r'yes">((\d+)(\-)(\d+))</span>'
    for i in file.find_all('sup'):
        # stringify once instead of once per pattern
        text = str(i)
        match1 = re.search(regex1, text, re.MULTILINE)
        match2 = re.search(regex2, text, re.MULTILINE)
        match3 = re.search(regex3, text, re.MULTILINE)
        if match1:
            # group(2) is just the digits, so wrap them directly
            n_m = '{#' + match1.group(2) + '}'
        elif match2:
            # wrap every number, then drop the ", " separators
            n_m = re.sub(r'(\d+)', r'{#\1}', match2.group(1)).replace(', ', '')
        elif match3:
            first = int(match3.group(2))
            last = int(match3.group(4))
            # fill in the range of refs, because endnote likes to have {#1}{#2}{#3}{#4}
            # (an inverted range like "4-1" yields '', matching the old loop)
            n_m = ''.join('{#' + str(k) + '}' for k in range(first, last + 1))
        else:
            continue
        i.replace_with(n_m)
    return file
49+
5050

5151
# main code
if input_file == '':
    print('Please provide a file')
else:
    # parse the document, rewrite the superscript references, save a copy
    converted = replace_sup(http_get(input_file))
    out_name = input_file + "_modified.htm"
    with open(out_name, "wb") as f_output:
        f_output.write(converted.prettify("utf-8"))
    print('References exchanged. Ready to open the modified HTM file in Microsoft Word.')

scrape_refs.py

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
1-
import sys
2-
import os.path
31
import re
4-
import string
52
import requests
63
import random
74
from bs4 import BeautifulSoup
8-
from urllib.parse import quote
5+
96

107
# iterations for random search
118
iterations = 20
@@ -20,7 +17,8 @@
2017
# define some variables
2118
# pretend to be a firefox browser
2219
SESSION = requests.Session()
23-
SESSION.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
20+
SESSION.headers.update(
21+
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
2422

2523
URL_SEARCH = 'https://pubmed.ncbi.nlm.nih.gov/?sort=date&term={q}'
2624
URL_CITE = 'https://pubmed.ncbi.nlm.nih.gov/{ident}/?format=pubmed'
@@ -36,12 +34,14 @@ def http_get(url):
3634
return BeautifulSoup(r.text, features="html.parser")
3735

3836
# split reference into authors, year and title
37+
38+
3939
def split_reference(reference):
4040
r = reference
4141
left = ''
4242
right = ''
4343
year = ''
44-
#find (YEAR)
44+
# find (YEAR)
4545
regex = r'\(\d\d\d\d\)'
4646
match = re.search(regex, str(r))
4747
if match:
@@ -56,40 +56,42 @@ def split_reference(reference):
5656
return 'Error no YEAR found'
5757
# where is YEAR in reference? and how long is reference?
5858
pos_y = r.find(year)
59-
l = len(r)
59+
le = len(r)
6060
# if YEAR somewhere in the middle of reference, split into left-part YEAR right-part
61-
if (l - pos_y >= 15):
61+
if (le - pos_y >= 15):
6262
left, right = r[:pos_y], r[pos_y+6:]
6363
# else split on 'et al' into left-part 'et al' right-part
64-
elif ((l - pos_y <= 15) and (r.find('et al'))!= -1):
64+
elif ((le - pos_y <= 15) and (r.find('et al')) != -1):
6565
rs = r.split('et al')
6666
left = rs[0]
6767
right = rs[1]
6868
# else find the third full-stop from the end and split there into left-part full-stop right-part
69-
else:
69+
else:
7070
rs = r.strip('.').rsplit('.', 2)
7171
left = rs[0]
7272
right = rs[1]
7373

74-
# clean up
75-
right = right.replace(year,'')
76-
right = right.replace(' ',' ')
74+
# clean up
75+
right = right.replace(year, '')
76+
right = right.replace(' ', ' ')
7777
right = right.strip()
78-
left = left.replace(' ',' ')
78+
left = left.replace(' ', ' ')
7979
left = left.strip()
80-
year = year.replace('(','')
81-
year = year.replace(')','')
80+
year = year.replace('(', '')
81+
year = year.replace(')', '')
8282
split_ref = [left, year, right]
8383
return split_ref
8484

8585
# last fallback if no search quesries find a reference: use random word combinations from the title
86-
def choose_random(reference, l=3):
86+
87+
88+
def choose_random(reference, leng=3):
8789
q = reference.split()
8890
i = 1
8991
r = []
9092
while i < 6:
9193
rand = random.randint(1, len(q)-1)
92-
if len(q[rand]) > l:
94+
if len(q[rand]) > leng:
9395
if (q[rand] in r):
9496
continue
9597
else:
@@ -100,6 +102,8 @@ def choose_random(reference, l=3):
100102
return r
101103

102104
# search for Pubmed ID in results page
105+
106+
103107
def get_articles(query):
104108
url = URL_SEARCH.format(q=query)
105109
soup = http_get(url)
@@ -114,18 +118,20 @@ def get_articles(query):
114118
return pubmed
115119

116120
# use Pubmed ID to create URL and copy entry in NML format
121+
122+
117123
def get_citations(ident, resolve=True):
    """Fetch the PubMed entry for ``ident`` in PubMed text format.

    ``resolve`` is unused but kept for backward-compatible signature.
    Returns the stripped text of the last ``#article-details`` tag on the
    page, or '' when none is found.
    """
    url = URL_CITE.format(ident=ident)
    soup = http_get(url)
    citations = ''
    # find_all is the modern bs4 spelling of the deprecated findAll
    for tag in soup.find_all(id="article-details"):
        # tag.string is None when the tag has child elements; the old code
        # crashed on .strip() in that case -- skip such tags instead
        if tag.string is not None:
            citations = tag.string.strip()
    return citations
124130

125131

126132
# main code
127133
# open input and output fines
128-
if(ref_file ==''):
134+
if(ref_file == ''):
129135
print('Please provide a file')
130136
quit()
131137

@@ -149,8 +155,8 @@ def get_citations(ident, resolve=True):
149155
if (reference == ''):
150156
continue
151157

152-
print ("\n---------------------------------------------------------")
153-
print ("Doing reference:", reference)
158+
print("\n---------------------------------------------------------")
159+
print("Doing reference:", reference)
154160

155161
# split the reference into author, year, title
156162
query = split_reference(reference)
@@ -161,49 +167,48 @@ def get_citations(ident, resolve=True):
161167

162168
# find article by author and title
163169
r = get_articles(q[0] + ' ' + q[2])
164-
print ("Query: " + q[0] + ' ' + q[2])
170+
print("Query: " + q[0] + ' ' + q[2])
165171

166172
# find article by author and year
167173
if len(r) == 0:
168174
r = get_articles(q[0] + ' ' + q[1])
169-
print ("No results -- trying: " + q[0] + ' ' + q[1])
175+
print("No results -- trying: " + q[0] + ' ' + q[1])
170176

171177
# find article by year and title
172178
if len(r) == 0:
173179
r = get_articles(q[1] + ' ' + q[2])
174-
print ("Still no results -- trying: " + q[1] + ' ' + q[2])
180+
print("Still no results -- trying: " + q[1] + ' ' + q[2])
175181

176182
# find article by author year and title
177183
if len(r) == 0:
178184
r = get_articles(q[0] + ' ' + q[1] + ' ' + q[2])
179-
print ("Still no results -- trying: " + q[0] + ' ' + q[1] + ' ' + q[2])
185+
print("Still no results -- trying: " + q[0] + ' ' + q[1] + ' ' + q[2])
180186

181187
# find article by author year and random words from title
182188
if len(r) == 0:
183189
its = 0
184190
while its < iterations:
185191
q2 = choose_random(reference)
186-
print ("Still no results -- trying again with random words: ", q[0] + q[1] + q2)
192+
print("Still no results -- trying again with random words: ", q[0] + q[1] + q2)
187193
r = get_articles(q[0] + ' ' + q[1] + ' ' + q2)
188194
if len(r) != 0:
189195
break
190196
its += 1
191197

192198
if len(r) == 0:
193-
print ("Still no results -- skipping")
199+
print("Still no results -- skipping")
194200
skipped += 1
195201
skipped_refs = skipped_refs + '\n' + reference
196202
continue
197-
print ("Result written")
203+
print("Result written")
198204
myFile.write(get_citations(r) + '\n\n')
199205

200206
if skipped > 0:
201-
print ("\n---------------------------------------------------------")
207+
print("\n---------------------------------------------------------")
202208
print("Total number of results skipped: ", skipped)
203209
print("Please check the following references:\n", skipped_refs)
204210
else:
205-
print ("\n---------------------------------------------------------")
211+
print("\n---------------------------------------------------------")
206212
print('Done')
207213

208214
myFile.close()
209-
ref.close()

0 commit comments

Comments
 (0)