scopus_clean_tokenize.py
from nltk.tokenize import word_tokenize
from string import punctuation

# Strip all punctuation except hyphens and parentheses; the parentheses
# are kept so that acronym definitions like "... (SVM)" survive cleaning.
my_punctuation = punctuation.replace('-', '')
my_punctuation = my_punctuation.replace('(', '')
my_punctuation = my_punctuation.replace(')', '')

target_indicators = ["detect"]


def upper_count(s):
    # Count the uppercase characters in a string.
    count = 0
    for c in s:
        count += c.isupper()
    return count


def window(l, start, end):
    # Return l[start..end] inclusive, clamping both bounds to the list.
    start = max(0, start)
    end = min(len(l) - 1, end)
    return l[start:end + 1]
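
# Illustration (not part of the original script): out-of-range bounds
# are clamped rather than raising, e.g.
#   >>> window(['a', 'b', 'c', 'd'], -2, 10)
#   ['a', 'b', 'c', 'd']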
def find_acronyms(tok):
    # Map each parenthesised acronym to the tokens preceding it: a window
    # of len(acronym) + 2 tokens before the "(" (the acronym's character
    # length plus slack, since its letters need not map one-to-one onto
    # words).
    start = False
    acronym_start = 0
    acronym_parts = []
    acronyms = {}
    for i in range(len(tok)):
        if tok[i] == "(":
            start = True
            acronym_start = i + 1
            acronym_parts = []  # reset; otherwise parts leak across acronyms
        elif tok[i] == ")":
            start = False
            acronym = ' '.join(acronym_parts)
            pre_acronym = window(tok, acronym_start - 1 - len(acronym) - 2,
                                 acronym_start - 2)
            acronyms[acronym] = ' '.join(pre_acronym)
        elif start:
            acronym_parts.append(tok[i])
    return acronyms
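
# Illustration (not part of the original script; the function is defined
# but never called below):
#   >>> find_acronyms(['we', 'use', 'a', 'support', 'vector', 'machine',
#   ...                '(', 'svm', ')'])
#   {'svm': 'use a support vector machine'}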

count = 0
abstract_words = {}
total_words = 0

# First pass over output.csv: build corpus-wide relative word frequencies
# from the abstracts (column 4 of the tab-separated file).
with open("output.csv", encoding="utf8") as f:
    lines = f.readlines()
i = 0
for line in lines:
    if i == 0:
        i += 1  # skip the header row
        continue
    line = line.split('\t')
    year = line[0]
    page = line[1]
    title = line[2]
    title_tokenized = word_tokenize(title)
    abstract = line[3]
    abstract = abstract.translate(str.maketrans("", "", my_punctuation)).lower()
    abstract_tokenized = word_tokenize(abstract)
    for word in abstract_tokenized:
        abstract_words.setdefault(word, 0)
        abstract_words[word] += 1
        total_words += 1

# Convert raw counts to relative frequencies across the whole corpus.
for word in abstract_words.keys():
    abstract_words[word] /= total_words

# Write the corpus frequencies out as tab-separated "word<TAB>frequency" rows.
with open("words.csv", "w", encoding="utf8") as f:
    for word in abstract_words.keys():
        f.write(word + "\t" + str(abstract_words[word]) + "\n")
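
# Illustration (values invented): rows of words.csv look like
#   the        0.052
#   detection  0.0017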
with open("output.csv", encoding="utf8") as f:
lines = f.readlines()
i = 0
for line in lines:
if i == 0:
i += 1
continue
line = line.split('\t')
year = line[0]
page = line[1]
title = line[2]
title_tokenized = word_tokenize(title)
abstract = line[3]
abstract = abstract.translate(str.maketrans("", "", my_punctuation)).lower()
abstract_tokenized = word_tokenize(abstract)
for word in abstract_tokenized:
frequency_in_abstract =
representation = abstract_words[word]
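        # The script ends here; presumably frequency_in_abstract would
        # next be compared against representation (e.g. their ratio as a
        # tf-idf-style distinctiveness score), but no such code appears
        # in the original.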