textcat_custom_model.py
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.sorters import prefer_uncertain
from prodigy.util import split_string
import random
from typing import List, Iterable


class DummyModel(object):
    # This is a dummy model to help illustrate how to use Prodigy with a model
    # in the loop. It currently "predicts" random numbers – but you can swap
    # it out for any model of your choice, for example a text classification
    # model implementation using PyTorch, TensorFlow or scikit-learn.

    def __init__(self, labels: List[str]):
        # The model can keep arbitrary state – let's use a simple random float
        # to represent the current weights
        self.weights = random.random()
        self.labels = labels

    def __call__(self, stream: Iterable[dict]):
        for eg in stream:
            # Score the example with respect to the current weights and
            # assign a label
            eg["label"] = random.choice(self.labels)
            score = (random.random() + self.weights) / 2
            yield (score, eg)

    def update(self, answers: List[dict]):
        # Update the model weights with the new answers. This method receives
        # the examples with an added "answer" key that either maps to "accept",
        # "reject" or "ignore".
        self.weights = random.random()
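        # Each answer in the batch is the original task dict plus the added
        # "answer" key, e.g. (illustrative values):
        # {"text": "Some example text", "label": "POSITIVE", "answer": "accept"}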


# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "textcat.custom-model",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
)
def textcat_custom_model(dataset: str, source: str, label: List[str]):
    """
    Use active learning-powered text classification with a custom model. To
    demonstrate how it works, this demo recipe uses a simple dummy model that
    "predicts" random scores. But you can swap it out for any model of your
    choice, for example a text classification model implementation using
    PyTorch, TensorFlow or scikit-learn.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)
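    # Each line of the source file is expected to be a JSON object with at
    # least a "text" key, e.g. (illustrative): {"text": "Some example text"}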
    # Load the dummy model
    model = DummyModel(labels=label)
    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example.
    stream = prefer_uncertain(model(stream))
    # The update method is called every time Prodigy receives new answers from
    # the web app. It can be used to update the model in the loop.
    update = model.update
    return {
        "view_id": "classification",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with batch of answers
    }
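

# Example invocation (the dataset name, source file and labels below are
# illustrative; -F tells Prodigy to load the custom recipe from this file):
# prodigy textcat.custom-model my_dataset ./news_headlines.jsonl --label POSITIVE,NEGATIVE -F textcat_custom_model.py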