-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCausalPhrasesExtraction.py
94 lines (71 loc) · 2.63 KB
/
CausalPhrasesExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 16:27:24 2023
@author: Choungryeol Axl Lee
"""
import os
import re
import glob
import numpy as np
import spacy as sp
import pandas as pd
import spacy_transformers
from spacy import displacy
from operator import concat
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, Isomap
from sentence_transformers import SentenceTransformer, util
# Texts from Transport Policy journal, abstracts only.
def parse_article_lines(lines):
    """Extract abstracts, keyword strings and years from one article file.

    Args:
        lines: The raw lines of one file, as returned by ``readlines()``.

    Returns:
        A tuple ``(abstracts, keyword_lines, years)`` of three lists of
        strings, each in the order the matching lines appear in ``lines``.
    """
    abstracts = []
    keyword_lines = []
    for line in lines:
        if 'Abstract:' in line:
            text = line.split(':', 1)[1].strip()
            # Entries whose text contains the literal 'ABSTRACT' are skipped.
            if 'ABSTRACT' not in text:
                abstracts.append(text)
        if 'Keywords:' in line:
            # maxsplit=1 keeps keyword lists that themselves contain a colon
            # intact, matching how the abstract line is split above.
            keyword_lines.append(line.split(':', 1)[1].strip())
    # The year is read from the line directly above every line containing
    # 'Pages', with surrounding whitespace and the last character removed.
    years = [
        lines[idx - 1].strip()[:-1]
        for idx, line in enumerate(lines)
        if 'Pages' in line
    ]
    return abstracts, keyword_lines, years


abstract = []
keywords = []
year = []
# Read in every article file.
Corpuses = glob.glob("./articles/*.txt")
for file in Corpuses:
    with open(file, "r", encoding="utf-8") as f:
        inputdata = f.readlines()
    file_abstracts, file_keywords, file_years = parse_article_lines(inputdata)
    abstract.extend(file_abstracts)
    keywords.extend(file_keywords)
    # Extending keeps `year` flat, so an empty corpus yields an empty list
    # instead of the TypeError that reducing an empty list would raise.
    year.extend(file_years)
# NOTE(review): zip() pairs the three lists by position and stops at the
# shortest one; an abstract dropped by the 'ABSTRACT' filter while its
# keywords line is kept would shift the pairing - confirm against the input.
data = pd.DataFrame(list(zip(abstract, keywords, year)), columns=['Abstract', 'Keywords', 'Year'])
# Two spaCy pipelines are loaded: the trained best model, and the default
# English model, which is used here only for sentence separation.
nlp = sp.load('model-best')
nlp_ = sp.load('en_core_web_lg')

# Split each abstract into sentences with the default model, recording for
# every kept sentence the position of its abstract within `data`.
Sentence = []
abs_id = []
for abstract_idx, abstract_text in enumerate(data.Abstract):
    for span in nlp_(abstract_text).sents:
        # Sentences of length two or less are discarded.
        if len(span) <= 2:
            continue
        Sentence.append(span.text)
        abs_id.append(abstract_idx)

# Built row-wise and transposed, so each list becomes one column.
df = pd.DataFrame([Sentence, abs_id]).T
df.columns = ['sentence', 'abstractID']
# Causal phrase extraction from the transport policy literature: keep every
# sentence whose 'CAUSAL' category score from the trained model is above 0.9,
# together with the ID of the abstract it came from.
causal_sentence = [
    [text, abstract_id]
    for text, abstract_id in zip(df['sentence'], df['abstractID'])
    if nlp(text).cats['CAUSAL'] > 0.9
]
phrases_causal = pd.DataFrame(causal_sentence, columns=['sentence', 'abstractID'])
# Writing the causal sentences to disk is currently disabled:
#phrases_causal.to_csv('Causal Phrases.csv', index = False, encoding = 'utf-8')
# Import the model that was trained on a scientific literature corpus.
model = SentenceTransformer('allenai-specter')
# Find the embedding of each causal sentence.
corpus_embeddings = model.encode(phrases_causal.sentence.to_list(), convert_to_tensor=True)
# With convert_to_tensor=True the result is a torch.Tensor, which has no
# to_csv(); move it to host memory as a NumPy array and wrap it in a
# DataFrame so pandas can write the CSV.
pd.DataFrame(corpus_embeddings.detach().cpu().numpy()).to_csv('causal_phrases_embeddings.csv', index=False)