-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_keybert.py
68 lines (52 loc) · 1.82 KB
/
run_keybert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from keybert import KeyBERT
from konlpy.tag import Mecab
import time
# +-----------------------+
# | MECAB FUNC            |
# +-----------------------+
# Shared Mecab tagger used by use_mecab(); it is constructed once, when the
# module is loaded.
# NOTE(review): dicpath is a hard-coded Windows-style location - point it at
# wherever mecab-ko-dic is installed on the machine running this script.
mecab = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
def use_mecab(passage):
    """Reduce *passage* to its nouns, joined by single spaces.

    The text is run through the module-level ``mecab`` tagger and only the
    nouns that are at least two characters long are kept, in the order the
    tagger returned them.

    Args:
        passage: Text to analyze.

    Returns:
        The surviving nouns joined by one space each. An empty string is
        returned when the passage yields no noun of two or more characters.
    """
    # Filter in a single pass. The previous pop-and-rescan loop never
    # terminated when the noun list was empty or contained only
    # one-character nouns, and it rescanned the list after every removal.
    nouns = [noun for noun in mecab.nouns(passage) if len(noun) >= 2]
    return ' '.join(nouns)
# +-------------------------------------+
# | KEYBERT MODEL DEFINE                |
# +-------------------------------------+
# Print how many seconds it takes to construct the KeyBERT model.
start = time.time()
kw_model = KeyBERT(model="distiluse-base-multilingual-cased-v1")
print(time.time() - start)

# +-------------------------------------+
# | READ FILE                           |
# +-------------------------------------+
filename = "reranking_10000_re.csv"
csv_f = pd.read_csv(filename, encoding='UTF-8')

# +-------------------------------------+
# | START                               |
# +-------------------------------------+
# Only the first MAX_ROWS rows are processed. Raise the limit (for example to
# len(csv_f.index)) to run over the whole file.
MAX_ROWS = 10
# Clamp to the rows actually present so a short CSV does not raise KeyError.
row_count = min(MAX_ROWS, len(csv_f.index))

keyword_result = []
start = time.time()
for i in range(row_count):
    # Each passage is the two text columns of the row, space-separated,
    # reduced to its nouns before keyword extraction.
    passage = str(csv_f['Column1'][i]) + ' ' + str(csv_f['Column2'][i])
    passage = use_mecab(passage)
    # KEY EXTRACTION
    keyword_result.append(
        kw_model.extract_keywords(passage, keyphrase_ngram_range=(1, 3), top_n=5)
    )
# Print how many seconds the extraction loop took.
print(time.time() - start)

# +-------------------------------------+
# | RESULT                              |
# +-------------------------------------+
# One spreadsheet row per processed passage, holding whatever
# extract_keywords returned for it.
data_df = pd.DataFrame(keyword_result)
data_df.to_excel('keyword_result.xlsx')