-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathquery.py
115 lines (98 loc) · 2.84 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import json
import requests
import subprocess
from datetime import datetime
# Module-level store for the loaded records. It starts out as an empty dict
# and is rebound to the result of read_collection() in the script section at
# the bottom of this file.
collection = dict()
def execute_commandRealtime(cmd):
    """Run a command and yield its standard output one line at a time.

    Function taken from pyrpipe (Singh et al., 2020).

    Args:
        cmd: Command and its arguments as a list; handed unchanged to
            ``subprocess.Popen``.

    Yields:
        Each line read from the child's stdout, as text, in the order it
        is produced.

    Raises:
        subprocess.CalledProcessError: After all output has been consumed,
            if the command exited with a non-zero status.

    Usage:
        for output in execute_commandRealtime(['curl', '-o', outfile, link]):
            print(output)
    """
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    stream = process.stdout
    while True:
        line = stream.readline()
        # readline() only returns an empty string at end of stream.
        if line == "":
            break
        yield line
    stream.close()
    exit_status = process.wait()
    if exit_status != 0:
        raise subprocess.CalledProcessError(exit_status, cmd)
def update_collection(link='https://connect.biorxiv.org/relate/collection_json.php?grp=181',
                      outfile='collection.json'):
    """Download the bioRxiv and medRxiv collection as a JSON file.

    The download is delegated to the external ``curl`` program, so ``curl``
    must be available on the PATH.

    Args:
        link: URL of the collection JSON. Defaults to the address this
            script has always used.
        outfile: Path the downloaded JSON is written to. Defaults to
            ``collection.json`` in the current working directory.

    Raises:
        subprocess.CalledProcessError: If ``curl`` exits with a non-zero
            status (propagated from ``execute_commandRealtime``).
    """
    print('Downloading ...')
    # Echo whatever curl writes to stdout while the transfer is running.
    for output in execute_commandRealtime(['curl', '-o', outfile, link]):
        print(output)
def read_collection(filename='collection.json'):
    """Load the downloaded collection and return its records.

    Args:
        filename: Path of the JSON file to read. Defaults to
            ``collection.json``.

    Returns:
        The value stored under the top-level ``'rels'`` key of the JSON
        document. The number of records is printed before returning.

    Raises:
        KeyError: If the document has no top-level ``'rels'`` key.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        data = json.load(f)
    # Look the key up directly instead of scanning every top-level key.
    try:
        records = data['rels']
    except KeyError:
        raise KeyError("no 'rels' key found in {}".format(filename)) from None
    print('{} records found'.format(len(records)))
    return records
def get_terms():
    """Print the field names found in the first record of ``collection``."""
    print('Available terms:')
    # Only the first record is inspected.
    for term in collection[0].keys():
        print(term)
def searchall(keywords):
    """Run ``search`` for every keyword and concatenate the matches.

    Args:
        keywords: Iterable of search terms.

    Returns:
        A single list holding the matches for each keyword, in keyword
        order. A record matched by several keywords appears once per
        keyword; no de-duplication is done here.
    """
    return [record for keyword in keywords for record in search(keyword)]
def search(term, records=None):
    """Return the records in which any field contains ``term``.

    The comparison is a case-insensitive substring test against the string
    form of every field value.

    Args:
        term: Text to look for.
        records: Optional iterable of dicts to search. When omitted, the
            module-level ``collection`` is searched.

    Returns:
        A list of the matching records in their original order. Each
        record appears at most once, however many of its fields match.
    """
    if records is None:
        records = collection
    print('Searching', term)
    # Lower-case the term once rather than once per field.
    needle = term.lower()
    # any() stops at the first matching field, so a record is never added
    # more than once.
    return [
        record for record in records
        if any(needle in str(value).lower() for value in record.values())
    ]
def get_title(res):
    """Return the unique titles of the given records, in first-seen order.

    Args:
        res: Iterable of record dicts, each with a ``'rel_title'`` key.

    Returns:
        A list of the distinct ``'rel_title'`` values, ordered by first
        appearance.

    Raises:
        KeyError: If a record has no ``'rel_title'`` key.
    """
    # dict keys are unique and keep insertion order, which de-duplicates in
    # linear time instead of rescanning a list for every record.
    return list(dict.fromkeys(record['rel_title'] for record in res))
def filter_date(res, startdate):
    """Keep the records dated on or after ``startdate``.

    Args:
        res: Iterable of record dicts whose ``'rel_date'`` value is a
            ``YYYY-MM-DD`` string.
        startdate: ``datetime`` used as the inclusive lower bound.

    Returns:
        A new list with the records that pass, in their original order.
    """
    def on_or_after(record):
        # Parse the record's date string so it can be compared to startdate.
        return datetime.strptime(record['rel_date'], '%Y-%m-%d') >= startdate

    return [record for record in res if on_or_after(record)]
# Step 1: refresh the local copy of the collection (downloads around 15 MB
# of JSON). Uncomment to run it.
#update_collection()

# Load the collection into memory.
collection = read_collection()

# Uncomment to list the field names available in each record.
#get_terms()

# Perform the search. A single-term search would be:
#res=search(' RNA-seq')
tosearch = [' RNA-seq', 'transcriptom', 'express', 'sequencing']
res = searchall(tosearch)
print(len(res))
print(len(get_title(res)))

# Cutoff date: filter_date keeps the records dated on or after it.
fdate = datetime(2020, 6, 25)
print('filtering results before', fdate)
final_res = get_title(filter_date(res, fdate))
print(len(final_res))
print('\n'.join(final_res))