simple rule based named entity recognition
Available on pip
pip install simple_NER
Entities can be extracted with simple rules using Padaos, a dead simple regex parser
from simple_NER.rules import RuleNER
ner = RuleNER()
ner.add_rule("name", "my name is {person}")
for ent in ner.extract_entities("my name is jarbas"):
assert ent.as_json() == {'confidence': 1,
'data': {},
'entity_type': 'person',
'rules': [{'name': 'name',
'rules': ['my name is {person}']}],
'source_text': 'my name is jarbas',
'spans': [(11, 17)],
'value': 'jarbas'}
regex can also be used
from simple_NER.rules.rx import RegexNER
ner = RegexNER()
text = "i went to japan in 12/10/1996"
regex = r'((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))'
ner.add_rule("date", regex)
for e in ner.extract_entities(text):
assert e.as_json() == {'confidence': 1,
'data': {},
'entity_type': 'date',
'rules': [{'name': 'date',
'rules': [
'((0?[13578]|10|12)(-|\\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((\\d{4})|(\\d{2}))|(0?[2469]|11)(-|\\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((\\d{4}|\\d{2})))']}],
'source_text': 'i went to japan in 12/10/1996',
'spans': [(19, 29)],
'value': '12/10/1996'}
Entities are extracted using Padatious, An efficient and agile neural network intent parser
This will learn from the rules and extract more variations
from simple_NER.rules.neural import NeuralNER
ner = NeuralNER()
ner.add_rule("name", "my name is {person}")
for ent in ner.extract_entities("the name is jarbas"):
assert ent.as_json() == {'confidence': 0.5251495787186434,
'data': {},
'entity_type': 'person',
'rules': [{'name': 'name',
'rules': ['my name is {person}']}],
'source_text': 'the name is jarbas',
'spans': [(12, 18)],
'value': 'jarbas'}
for ent in ner.extract_entities("name is kevin"):
assert ent.as_json() == {'confidence': 0.8363423970007801,
'data': {},
'entity_type': 'person',
'rules': [{'name': 'name',
'rules': ['my name is {person}']}],
'source_text': 'name is kevin',
'spans': [(8, 13)],
'value': 'kevin'}
Extracting emails using regex rules
from simple_NER.annotators.email_ner import EmailNER
ner = EmailNER()
text = "my email is [email protected]"
for ent in ner.extract_entities(text):
assert ent.as_json() == {'confidence': 1,
'data': {},
'entity_type': 'email',
'rules': [{'name': 'email',
'rules': [
'(?:[a-z0-9!#$%&\\\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\\\'*+/=?^_`{|}~-]+)*|"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])']}],
'source_text': 'my email is [email protected]',
'spans': [(12, 34)],
'value': '[email protected]'}
Extracting Proper Nouns with regex
from simple_NER.annotators.names_ner import NamesNER
ner = NamesNER()
text = "I am JarbasAI , but my real name is Casimiro"
for e in ner.extract_entities(text):
print(e.as_json())
"""
{'entity_type': 'Noun', 'spans': [(5, 13)], 'value': 'JarbasAI', 'source_text': 'I am JarbasAI , but my real name is Casimiro', 'confidence': 0.8, 'data': {}, 'rules': [{'name': 'names_rx', 'rules': ["\\b((?:[A-Z][a-z][-A-Za-z']*(?: *[A-Z][a-z][-A-Za-z']*)*)\\b|\\b(?:[A-Z][a-z][-A-Za-z']*))\\b"]}]}
{'entity_type': 'Noun', 'spans': [(36, 44)], 'value': 'Casimiro', 'source_text': 'I am JarbasAI , but my real name is Casimiro', 'confidence': 0.8, 'data': {}, 'rules': [{'name': 'names_rx', 'rules': ["\\b((?:[A-Z][a-z][-A-Za-z']*(?: *[A-Z][a-z][-A-Za-z']*)*)\\b|\\b(?:[A-Z][a-z][-A-Za-z']*))\\b"]}]}
"""
Countries, Capital Cities and Cities can be looked up from a wordlist
from simple_NER.annotators.locations_ner import LocationNER, CitiesNER
ner = LocationNER()
# NOTE: case sensitive, enable detection of lowercase cities/countries
# ner = LocationNER(lowercase=True)
text = """The Capital of Portugal is Lisbon"""
for r in ner.extract_entities(text):
print(r.value, "-", r.entity_type)
print(r.as_json())
"""
Portugal - Country
{'confidence': 1,
'data': {'capital': 'Lisbon',
'country_code': 'PT',
'hemisphere': 'north',
'latitude': 39.5,
'longitude': -8,
'name': 'Portugal',
'timezones': ['Europe/Lisbon',
'Atlantic/Madeira',
'Atlantic/Azores']},
'entity_type': 'Country',
'rules': [],
'source_text': 'The Capital of Portugal is Lisbon',
'spans': [(15, 23)],
'value': 'Portugal'}
Lisbon - Capital City
{'confidence': 1,
'data': {'country_code': 'PT',
'country_name': 'Portugal',
'hemisphere': 'north',
'name': 'Lisbon'},
'entity_type': 'Capital City',
'rules': [],
'source_text': 'The Capital of Portugal is Lisbon',
'spans': [(27, 33)],
'value': 'Lisbon'}
"""
ner = CitiesNER()
# NOTE: case sensitive
# ner = CitiesNER(lowercase=True)
text = """Braga is in northern portugal"""
for r in ner.extract_entities(text):
print(r.value, "-", r.entity_type)
print(r.as_json())
"""
Braga - City
{'confidence': 1,
'data': {'country_code': 'PT',
'hemisphere': 'north',
'latitude': 41.55032,
'longitude': -8.42005,
'name': 'Braga'},
'entity_type': 'City',
'rules': [],
'source_text': 'Braga is in northern portugal',
'spans': [(0, 5)],
'value': 'Braga'}
"""
Datetime extraction is powered by lingua_franca
from simple_NER.annotators.datetime_ner import DateTimeNER
ner = DateTimeNER()
for r in ner.extract_entities("my birthday is on december 5th"):
assert r.entity_type == "relative_date"
print("day:", r.day, "month:", r.month, "year:", r.year)
"""
day: 5 month: 12 year: 2019
"""
for r in ner.extract_entities("entries are due by January 4th, 2017 at 8:30pm"):
assert r.entity_type == "relative_date"
print("day:", r.day,"month:", r.month, "year:", r.year, "hour:", r.hour,
"minute:", r.minute)
"""
day: 4 month: 1 year: 2017 hour: 20 minute: 30
"""
for r in ner.extract_entities(
"tomorrow is X yesterday was Y in 10 days it will be Z"):
assert r.entity_type == "relative_date"
print(r.value,
"day:", r.day, "month:", r.month, "year:", r.year,
"hour:", r.hour, "minute:", r.minute)
"""
tomorrow day: 30 month: 11 year: 2020 hour: 0 minute: 0
yesterday day: 28 month: 11 year: 2020 hour: 0 minute: 0
in 10 days day: 9 month: 12 year: 2020 hour: 0 minute: 0
"""
durations/timedeltas extraction is powered by lingua_franca
from simple_NER.annotators.datetime_ner import TimedeltaNER
ner = TimedeltaNER()
for r in ner.extract_entities(
"5 minutes ago was X 10 minutes from now is Y in 19 hours will "
"be N"):
assert r.entity_type == "duration"
print(r.value, r.total_seconds)
"""
5 minutes 300.0
10 minutes 600.0
19 hours 68400.0
"""
for r in ner.extract_entities(
"What President served for five years six months 2 days"):
# NOTE months/years are not supported because they are not explicit
# how many days is 1 month? how many days is 1 year?
assert r.entity_type == "duration"
print(r.value, r.total_seconds)
"""2 days 172800.0"""
for r in ner.extract_entities("starts in 5 minutes"):
assert r.entity_type == "duration"
print(r.value, r.total_seconds)
"""5 minutes 300.0"""
for r in ner.extract_entities("starts in five minutes"):
assert r.entity_type == "duration"
print(r.value, r.total_seconds)
"""5 minutes 300.0"""
Using Quantulum3 for information extraction of quantities, measurements and their units from unstructured text
from simple_NER.annotators.units_ner import UnitsNER
ner = UnitsNER()
for r in ner.extract_entities("The LHC smashes proton beams at 12.8–13.0 TeV"):
assert r.data_value == 12.9
assert r.unit.name == "teraelectronvolt"
assert r.value == "12.8–13.0 TeV"
assert r.as_json() == \
{'confidence': 1,
'data': {'lang': 'en_US',
'spoken': 'twelve point nine teraelectron volts',
'uncertainty': 0.09999999999999964,
'unit': {'dimensions': [
{'base': 'teraelectronvolt', 'power': 1}],
'entity': {
'dimensions': [{'base': 'force', 'power': 1},
{'base': 'length',
'power': 1}],
'name': 'energy',
'uri': 'Energy'},
'lang': 'en_US',
'name': 'teraelectronvolt',
'original_dimensions': [
{'base': 'teraelectronvolt',
'power': 1,
'surface': 'TeV'}],
'surfaces': ['teraelectron volt',
'teraelectronvolt',
'teraelectron-volt'],
'symbols': ['TeV'],
'uri': 'Electronvolt'},
'value': 12.9},
'entity_type': 'Energy:Electronvolt',
'rules': [],
'source_text': 'The LHC smashes proton beams at 12.8–13.0 TeV',
'spans': [(32, 45)],
'value': '12.8–13.0 TeV'}
The most relevant keywords can be annotated using Rake
from simple_NER.annotators.keyword_ner import KeywordNER
ner = KeywordNER()
text = "Mycroft is a free and open-source voice assistant for Linux-based operating systems that uses a natural language user interface"
# extract keywords
ents = list(ner.extract_entities(text)) # generator, needs list()
# group into tuples of (keyword, score)
keywords = [(ent.value, ent.score) for ent in ents]
keywords = sorted(keywords) # sort alphabetically
assert sorted(keywords) == [('free', 1.0),
('linux-based operating systems', 9.0),
('mycroft', 1.0),
('natural language user interface', 16.0),
('open-source voice assistant', 9.0)]
Extraction of written numbers is powered by lingua_franca
from simple_NER.annotators.numbers_ner import NumberNER
ner = NumberNER()
for r in ner.extract_entities("three hundred trillion tons of spinning metal"):
"""
{'confidence': 1,
'data': {'number': '300000000000000.0'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'three hundred trillion tons of spinning metal',
'spans': [(0, 22)],
'value': 'three hundred trillion'}
"""
ner = NumberNER(short_scale=False)
for r in ner.extract_entities("three hundred trillion tons of spinning metal"):
"""
{'confidence': 1,
'data': {'number': '3e+20'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'three hundred trillion tons of spinning metal',
'spans': [(0, 22)],
'value': 'three hundred trillion'}
"""
ner = NumberNER()
for r in ner.extract_entities("the 5th number of the third thing"):
"""
{'confidence': 1,
'data': {'number': '5'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'the 5th number of the third thing',
'spans': [(4, 7)],
'value': '5th'}
{'confidence': 1,
'data': {'number': '3'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'the 5th number of the third thing',
'spans': [(22, 27)],
'value': 'third'}
"""
ner = NumberNER(ordinals=False)
for r in ner.extract_entities("the 5th number of the third thing"):
"""
{'confidence': 1,
'data': {'number': '5'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'the 5th number of the third thing',
'spans': [(4, 7)],
'value': '5th'}
{'confidence': 1,
'data': {'number': '0.3333333333333333'},
'entity_type': 'written_number',
'rules': [],
'source_text': 'the 5th number of the third thing',
'spans': [(22, 27)],
'value': 'third'}
"""
Some web based annotators are also provided
Using spotlight we can annotate entities from dbpedia
from simple_NER.annotators.remote.dbpedia import SpotlightNER
# you can also self host
host='http://api.dbpedia-spotlight.org/en/annotate'
ner = SpotlightNER(host)
for r in ner.extract_entities("London was founded by the Romans"):
print(r.value, r.entity_type, r.uri)
score = r.similarityScore
"""
London Wikidata:Q515 http://dbpedia.org/resource/London
London Wikidata:Q486972 http://dbpedia.org/resource/London
London Schema:Place http://dbpedia.org/resource/London
London Schema:City http://dbpedia.org/resource/London
London DBpedia:Settlement http://dbpedia.org/resource/London
London DBpedia:PopulatedPlace http://dbpedia.org/resource/London
London DBpedia:Place http://dbpedia.org/resource/London
London DBpedia:Location http://dbpedia.org/resource/London
London DBpedia:City http://dbpedia.org/resource/London
Romans Wikidata:Q6256 http://dbpedia.org/resource/Ancient_Rome
Romans Schema:Place http://dbpedia.org/resource/Ancient_Rome
Romans Schema:Country http://dbpedia.org/resource/Ancient_Rome
Romans DBpedia:PopulatedPlace http://dbpedia.org/resource/Ancient_Rome
Romans DBpedia:Place http://dbpedia.org/resource/Ancient_Rome
Romans DBpedia:Location http://dbpedia.org/resource/Ancient_Rome
Romans DBpedia:Country http://dbpedia.org/resource/Ancient_Rome
"""
wrappers are also provided for performing NER with external libs
If you have snips_nlu installed you can extract the builtin entities
from simple_NER.annotators.snips_ner import SnipsNER
ner = SnipsNER()
text = "The farmer had 2 cows, The cows died after 5 days."
for e in ner.extract_entities(text):
print(e.value, e.entity_type)
"""
2 snips/number
after 5 days snips/date
"""
from simple_NER.annotators.nltk_ner import NltkNER
ner = NltkNER()
text = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world"."""
for r in ner.extract_entities(text):
print(r.value, r.entity_type)
"""
Israeli GPE
Benjamin Netanyahu PERSON
Iran GPE
"""