-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathner.py
65 lines (47 loc) · 2.24 KB
/
ner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""
******************************************************************************************************************
@author: Ananya Mukherjee
Name: ner.py
Description : This program retrieves named entity tags using spaCy's model trained on OntoNotes 5.
Input : Input type and input text.
If the input provided is a URL, the webpage text is extracted by web scraping and Named Entity Recognition is then applied to it.
If the input is plain text, Named Entity Recognition is applied directly to its sentences.
Output : Returns the NER-tagged text marked up as HTML (for display in the client's browser) and the plain annotated text.
******************************************************************************************************************
"""
from bs4 import BeautifulSoup
import requests
import re
from spacy import displacy
import en_core_web_sm
# Load the spaCy pipeline once at import time; every RetrieveNER method
# below reuses this shared module-level `nlp` object.
nlp = en_core_web_sm.load()
class RetrieveNER:
    """Run named entity recognition on a URL's page text or on raw text.

    The instance stores the kind of input (``'url'`` or ``'text'``) and the
    input itself; call :meth:`RetrieveNER` to obtain the tagged output.
    """

    def __init__(self, inpTyp, input):
        """Store the input type and the input coerced to ``str``.

        Args:
            inpTyp: ``'url'`` or ``'text'``; selects how ``input`` is handled.
            input: The URL or the text to analyse (converted with ``str``).
        """
        self.inpTyp = inpTyp
        self.input = str(input)

    def renderNerOutput(self, text):
        """Return displaCy's entity ('ent' style) rendering of ``text``.

        Args:
            text: The parsed document handed to ``displacy.render``.
        """
        return displacy.render(text, jupyter=None, style='ent')

    def getAnnotatedOutput(self, document):
        """Return a list of ``(item, item.ent_type_)`` pairs for ``document``.

        Args:
            document: An iterable whose items expose ``ent_type_``.
        """
        return [(token, token.ent_type_) for token in document]

    def url_to_string(self, url, timeout=30):
        """Fetch ``url`` and return its visible text as a single line.

        ``script``, ``style`` and ``aside`` elements are removed before the
        text is extracted; runs of newlines/tabs are collapsed to one space.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error (previously the call could block
                forever on an unresponsive host).
        """
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html5lib')
        # Drop elements whose content is not part of the readable page text.
        for element in soup(["script", "style", 'aside']):
            element.extract()
        return " ".join(re.split(r'[\n\t]+', soup.get_text()))

    def getNerTagsFromURl(self, url):
        """Return ``(html, annotations)`` for the text of the page at ``url``."""
        page_text = self.url_to_string(url)
        # Parse once and reuse the result for both outputs instead of
        # running the pipeline twice on the same string.
        document = nlp(page_text)
        return (self.renderNerOutput(document), self.getAnnotatedOutput(document))

    def getNerTagsFromText(self, inputText):
        """Return ``(html, annotations)`` for ``inputText``.

        The HTML is rendered from a re-parse of the sentences joined by a
        single space, while the annotations come from the parse of the
        original ``inputText``.
        """
        document = nlp(inputText)
        sentence_texts = [sent.text for sent in document.sents]
        rejoined = nlp(str(" ".join(sentence_texts)))
        return (self.renderNerOutput(rejoined), self.getAnnotatedOutput(document))

    def RetrieveNER(self):
        """Dispatch on ``self.inpTyp`` and return ``(html, str(annotations))``.

        Raises:
            ValueError: If ``self.inpTyp`` is neither ``'url'`` nor ``'text'``
                (the original fell through and failed with an unbound local).
        """
        if self.inpTyp == 'url':
            htmltags, tags = self.getNerTagsFromURl(self.input)
        elif self.inpTyp == 'text':
            htmltags, tags = self.getNerTagsFromText(self.input)
        else:
            raise ValueError(
                "Unsupported input type %r; expected 'url' or 'text'" % (self.inpTyp,)
            )
        return htmltags, str(tags)