-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMaxArtDistance.py
62 lines (56 loc) · 2.63 KB
/
MaxArtDistance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import xml.etree.ElementTree as ET
import os
def maxproiel(froot, maxds):
    """Update the running article-distance extremes from one PROIEL tree.

    The tree is walked positionally: every element three levels below
    ``froot`` is treated as a sentence and its direct children as tokens.
    For each token whose ``lemma`` attribute equals ``'ὁ'`` two signed
    differences of integer ``id`` attributes are taken:

    * id of the last real token minus id of the article, which replaces the
      running maximum when it is larger;
    * id of the first real token minus id of the article, which replaces the
      running minimum when it is smaller.

    A "real" token is one without a truthy ``empty-token-sort`` attribute.
    Empty tokens are ignored at both ends of the sentence, and a sentence
    without any real token is skipped instead of raising ``IndexError``.

    Every time an extreme changes, a "New max:" / "New min:" line with both
    current extremes and the article's ``citation-part`` is printed.

    Args:
        froot: Root element of the parsed treebank.
        maxds: Two-item sequence ``[leftmost, rightmost]`` holding the
            extremes found so far. It is not modified.

    Returns:
        A new list ``[leftmost, rightmost]`` with the updated extremes.

    Raises:
        TypeError, ValueError: If a needed ``id`` attribute is missing or is
            not an integer literal.
    """
    # NOTE(review): the distance is a difference of ``id`` values, so it is a
    # token count only if ids are consecutive inside a sentence - confirm
    # against the corpus.
    leftmost, rightmost = maxds[0], maxds[1]
    for source in froot:
        for division in source:
            for sentence in division:
                articles = [tok for tok in sentence if tok.get('lemma') == 'ὁ']
                if not articles:
                    continue
                # Empty tokens are not part of the surface sentence, so they
                # must not define either boundary.
                real_tokens = [
                    tok for tok in sentence
                    if not tok.get('empty-token-sort')
                ]
                if not real_tokens:
                    continue
                # Boundary ids are the same for every article in the sentence.
                first_id = int(real_tokens[0].get('id'))
                last_id = int(real_tokens[-1].get('id'))
                for token in articles:
                    article_id = int(token.get('id'))
                    to_end = last_id - article_id
                    if to_end > rightmost:
                        rightmost = to_end
                        print("New max:", leftmost, rightmost, "at",
                              token.get('citation-part'))
                    to_start = first_id - article_id
                    if to_start < leftmost:
                        leftmost = to_start
                        print("New min:", leftmost, rightmost, "at",
                              token.get('citation-part'))
    return [leftmost, rightmost]
def maxperseus(froot, maxds):
    """Update the running article-distance extremes from one Perseus tree.

    Every element in the tree that directly contains ``word`` children is
    treated as a sentence, whatever its depth below ``froot``. For each
    ``word`` whose ``lemma`` attribute equals ``'ὁ'`` two signed differences
    of integer ``id`` attributes are taken:

    * id of the sentence's last ``word`` minus id of the article, which
      replaces the running maximum when it is larger;
    * id of the sentence's first ``word`` minus id of the article, which
      replaces the running minimum when it is smaller.

    Every time an extreme changes, a "New max:" / "New min:" line is printed
    with both current extremes, the sentence's ``subdoc`` attribute and the
    root's ``cts`` attribute.

    Args:
        froot: Root element of the parsed treebank.
        maxds: Two-item sequence ``[leftmost, rightmost]`` holding the
            extremes found so far. It is not modified.

    Returns:
        A new list ``[leftmost, rightmost]`` with the updated extremes.

    Raises:
        TypeError, ValueError: If a needed ``id`` attribute is missing or is
            not an integer literal.
    """
    # NOTE(review): every ``word`` counts towards the sentence boundaries;
    # if the corpus contains inserted/elliptical nodes they are not filtered
    # out here - confirm whether they should be.
    leftmost, rightmost = maxds[0], maxds[1]
    source_id = froot.get('cts')
    # iter() reaches sentences at any depth, so a file with sentences directly
    # under the root is handled as well as one with a wrapper element.
    for sentence in froot.iter():
        words = sentence.findall('word')
        if not words:
            continue
        first_id = last_id = None  # parsed only once an article is seen
        for word in words:
            if word.get('lemma') != 'ὁ':
                continue
            if last_id is None:
                first_id = int(words[0].get('id'))
                last_id = int(words[-1].get('id'))
            article_id = int(word.get('id'))
            to_end = last_id - article_id
            if to_end > rightmost:
                rightmost = to_end
                print("New max:", leftmost, rightmost, "at",
                      sentence.get('subdoc'), "in", source_id)
            to_start = first_id - article_id
            if to_start < leftmost:
                leftmost = to_start
                print("New min:", leftmost, rightmost, "at",
                      sentence.get('subdoc'), "in", source_id)
    return [leftmost, rightmost]
# Driver: fold every treebank file in the corpus directory into one running
# [leftmost, rightmost] pair, dispatching on the tag of each file's root.
TREEBANK_DIR = '/home/chris/Desktop/CustomTB'
NON_TREEBANK_ENTRIES = ('README.md', '.git')

maxDs = [0, 0]
# Work from inside the corpus directory so bare entry names can be parsed.
os.chdir(TREEBANK_DIR)
for entry in os.listdir(TREEBANK_DIR):
    if entry in NON_TREEBANK_ENTRIES:
        continue
    root = ET.parse(entry).getroot()
    if root.tag == 'proiel':
        maxDs = maxproiel(root, maxDs)
    elif root.tag == 'treebank':
        maxDs = maxperseus(root, maxDs)