-
Notifications
You must be signed in to change notification settings - Fork 2
/
recipe_parser.py
105 lines (83 loc) · 3.89 KB
/
recipe_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import spacy, pprint, re
from spacy import displacy
from fractions import Fraction
from first_dataset import ingredient_dataset
# Sample free-text ingredient lines covering plain units, fractions,
# adjectives before the unit/ingredient, and a parenthesised package size.
# Not referenced by the rest of the visible code.
test_recipe_text = [
    "1 cup flour, sifted",
    "1 teaspoons sea salt",
    "1 egg",
    "1/2 cup milk",
    "1 rounded tablespoon baking powder",
    "2 tablespoons olive oil",
    "3 cups lightly toasted sesame seeds",
    "1 (8 oz) package ground beef",
]

# Example of a structured recipe: ingredient name -> {unit: quantity}.
# Not referenced by the rest of the visible code.
recipe_example = {
    'flour': {'cup': 1},
    'salt': {'teaspoon': 1},
    'egg': {'whole': 1},
    'milk': {'cup': 0.5},
    'baking powder': {'tablespoon': 1},
}
class RecipeNLP:
    """Hold one loaded spaCy pipeline and parse ingredient lines with it.

    Loading the pipeline happens once in the constructor; every call to
    :meth:`parse` reuses it.
    """

    def __init__(self, model="en_core_web_sm"):
        """Load the spaCy pipeline to parse with.

        Args:
            model: Name passed to ``spacy.load``. Defaults to
                ``"en_core_web_sm"``, the model this class always used
                before the parameter existed.
        """
        self.nlp = spacy.load(model)

    def parse(self, recipe_line):
        """Parse a single ingredient line.

        Args:
            recipe_line: The raw ingredient text, e.g. ``"1/2 cup milk"``.

        Returns:
            A ``ParsedLine`` built from ``self.nlp`` and *recipe_line*.
        """
        return ParsedLine(self.nlp, recipe_line)
class ParsedLine:
    """An ingredient line split into amount, measurement and ingredient.

    The constructor runs *nlp* over the line and walks the resulting tokens,
    printing a trace of every decision to stdout.

    Attributes:
        ingredient: Ingredient words (head noun preceded by its ``compound``
            / ``amod`` children) joined by single spaces; ``""`` when no
            ingredient was found or it is on the ignore list.
        measurement: Lower-cased singular unit captured by
            ``measurement_regex`` (e.g. ``"cup"``), or ``""``.
        amount: ``Fraction`` parsed from the last numeric token, or ``0``
            when none could be parsed.
    """

    # Compiled once for the class (still reachable as
    # ``self.measurement_regex``). The word boundaries stop units matching
    # inside unrelated words ("oz" in "dozen", "cube" in "cubed"); the
    # optional "ful" keeps forms such as "cupful"/"tablespoonfuls" working.
    measurement_regex = re.compile(
        r'\b(cup|tablespoon|teaspoon|oz|pound|ounce|clove|cube)(?:ful)?s?\b',
        re.IGNORECASE,
    )

    def __init__(self, nlp, recipe_line):
        """Parse *recipe_line*.

        Args:
            nlp: Callable that takes the line and returns an iterable of
                tokens exposing ``text``, ``lower_``, ``dep_``, ``pos_`` and
                ``children`` (a loaded spaCy pipeline in ``RecipeNLP``).
            recipe_line: The raw ingredient text.
        """
        nlp_ing = nlp(recipe_line)
        self.ingredient = ""
        self.measurement = ""
        self.amount = 0
        ingredient_words = []
        print()
        print(recipe_line)
        for token in nlp_ing:
            print(token, token.dep_, token.pos_)
            unit_match = self.measurement_regex.search(token.text)
            if unit_match:
                print("the measurement is {}".format(token))
                self.measurement = unit_match.group(1).lower()
            elif token.dep_ in ('ROOT', 'nsubj', 'dobj', 'appos'):
                if token.pos_ in ('NOUN', 'PROPN'):
                    print("the ingredient is {}".format(token))
                    # ignore certain things that pretty much everyone has
                    if token.lower_ in ('salt', 'water'):
                        print('ingredient matches "ignore" list')
                        continue
                    ingredient_words.extend(self._modifiers(token))
                    ingredient_words.append(token.text)
                if token.pos_ == 'NUM':
                    # sometimes the amount is registered as nsubj
                    self._set_amount(token.text)
            elif token.pos_ == 'NUM' or (token.pos_ == 'X' and token.dep_ == 'nummod'):
                print("the amount is {}".format(token))
                self._set_amount(token.text)
        # Join once at the end so the name carries no trailing space.
        self.ingredient = " ".join(ingredient_words)

    def _modifiers(self, token):
        """Return texts of *token*'s compound/amod children that are not units."""
        print(list(token.children))
        print('checking children for compounds')
        words = []
        for child in token.children:
            print('checking {}'.format(child))
            if self.measurement_regex.match(child.text):
                continue
            print('not a measurement')
            if child.dep_ in ('compound', 'amod'):
                print('found a compound')
                words.append(child.text)
        return words

    def _set_amount(self, text):
        """Store *text* as a ``Fraction`` in ``self.amount``; report and keep the old value on failure."""
        try:
            self.amount = Fraction(text)
        except (ValueError, ZeroDivisionError) as err:
            # "1/0" raises ZeroDivisionError rather than ValueError.
            print("{}: {}".format(type(err).__name__, err))
def main():
    """Parse every line of ``ingredient_dataset`` and dump the annotations.

    Builds a list of ``(line, [amount, measurement, ingredient])`` tuples,
    pretty-prints it to stdout and writes the same text to
    ``annotated_data_old.py`` in the current working directory.
    """
    training_set = []
    recipe_nlp = RecipeNLP()
    for line in ingredient_dataset:
        parsed_line = recipe_nlp.parse(line)
        training_set.append((line, [parsed_line.amount, parsed_line.measurement, parsed_line.ingredient]))
    # Format once and reuse the text for both the console and the file.
    formatted = pprint.PrettyPrinter().pformat(training_set)
    print(formatted)
    # Explicit encoding so non-ASCII ingredient text is written the same way
    # on every platform instead of depending on the locale default.
    with open('annotated_data_old.py', 'w', encoding='utf-8') as annotated_file:
        annotated_file.write(formatted)


if __name__ == '__main__':
    main()