-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhamr.py
62 lines (48 loc) · 1.73 KB
/
hamr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#! /usr/bin/python
"""This is a prototype for the HAMR metadata enhancement process"""
import urllib
from xml.dom.minidom import parse, parseString
class Harvester(object):
    """Fetch a remote record and extract selected metadata fields from it.

    A harvester is configured with a URL prefix and a record format name.
    Only the 'xml' format is understood by handle_metadata; the matching,
    scoring and output methods are placeholders that do nothing yet.
    """

    def __init__(self, url, form):
        """Remember where records live and which format they arrive in.

        url  -- URL prefix; a record identifier is appended to it verbatim.
        form -- format name of the fetched records ('xml' is the only value
                handle_metadata acts on).
        """
        self.url = url
        self.form = form

    def get_record(self, identifier):
        """Fetch the record for *identifier* and return the response body.

        The request URL is ``self.url + identifier``. Whatever the opener
        raises on network failure propagates to the caller.
        """
        # Function-scope imports: urlopen lives in different modules on
        # Python 2 and Python 3, and the file-level imports stay untouched.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        from contextlib import closing

        # closing() guarantees the connection is released even if read()
        # fails, and works whether or not the response is a context manager.
        with closing(urlopen(self.url + identifier)) as response:
            return response.read()

    def handle_metadata(self, record):
        """Parse *record* and return a dict of the fields found in it.

        For the 'xml' form the result holds:

        title   -- direct text of the first ``ArticleTitle`` element; the key
                   is left out when the record has no such element.
        authors -- list with one dict per ``Author`` element, mapping the tag
                   name of each direct child element to its stripped text.
                   Children that carry no direct text are skipped.

        Any other form yields an empty dict. Malformed XML makes the parser
        raise; that exception is not caught here.
        """
        metadata = {}
        if self.form != 'xml':
            return metadata

        dom = parseString(record)

        titles = dom.getElementsByTagName("ArticleTitle")
        if titles:
            metadata['title'] = self._getText(titles[0].childNodes)

        metadata['authors'] = [
            self._author_fields(author)
            for author in dom.getElementsByTagName("Author")
        ]
        return metadata

    def dc_matching(self):
        """Walk through fields ... (placeholder, not implemented yet)."""
        pass

    def string_scoring(self, s1, s2):
        """Compare 2 strings and return a similarity score.

        Placeholder: no scoring is implemented yet, so this returns None.
        """
        pass

    def output_matches(self):
        """Output our xml for styling (placeholder, currently returns None)."""
        return

    def _author_fields(self, author):
        """Return {tag name: text} for the child elements of one author node."""
        fields = {}
        for child in author.childNodes:
            # The author element's own text nodes are just layout whitespace
            # between its children, so only element children are inspected.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            text = self._getText(child.childNodes).strip()
            if text:
                fields[child.tagName] = text
        return fields

    def _getText(self, nodelist):
        """Concatenate the data of the text nodes directly in *nodelist*.

        Non-text nodes are ignored and not descended into.
        """
        return ''.join(
            node.data for node in nodelist
            if node.nodeType == node.TEXT_NODE
        )
if __name__ == "__main__":
    # Demo run: fetch a single PubMed record as XML through the NCBI efetch
    # endpoint and show the fields handle_metadata pulls out of it.
    # The scheme is https, which is the base URL documented for E-utilities.
    base_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db='
                'pubmed&retmode=xml&id=')
    harvester = Harvester(base_url, 'xml')
    record = harvester.get_record('11748933')
    # A parenthesised single argument prints the same under the Python 2
    # print statement and the Python 3 print function.
    print(harvester.handle_metadata(record))