-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsentence_splitter.py
116 lines (95 loc) · 4.42 KB
/
sentence_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from __future__ import annotations
import logging
import ssl
from typing import TypeVar
import nltk
from flair.splitter import SegtokSentenceSplitter
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.documents import TextDocumentWithLabeledPartitions
logger = logging.getLogger(__name__)
D = TypeVar("D", bound=TextDocumentWithLabeledPartitions)
class NltkSentenceSplitter:
    """A document processor that adds sentence partitions to a TextDocumentWithLabeledPartitions document.

    It uses the NLTK Punkt tokenizer to split the text of the document into sentences. See
    https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.punkt.PunktSentenceTokenizer for more information.

    Args:
        partition_layer_name: The name of the partition layer to add the sentence partitions to. This layer
            must be an AnnotationLayer of LabeledSpan annotations.
        text_field_name: The name of the text field in the document to split into sentences.
        sentencizer_url: The URL to the NLTK Punkt tokenizer model.
        inplace: A boolean value that determines whether the sentence partitions are added to the input document
            or a new document is created.
    """

    def __init__(
        self,
        partition_layer_name: str = "labeled_partitions",
        text_field_name: str = "text",
        sentencizer_url: str = "tokenizers/punkt/PY3/english.pickle",
        inplace: bool = True,
    ):
        self.partition_layer_name = partition_layer_name
        self.text_field_name = text_field_name
        self.inplace = inplace
        # Download the NLTK Punkt tokenizer model. quiet=True suppresses the
        # status output that nltk.download would otherwise print on every
        # instantiation; the download itself is a no-op if the resource is
        # already available locally.
        nltk.download("punkt", quiet=True)
        self.sentencizer = nltk.data.load(sentencizer_url)

    def __call__(self, document: D) -> D:
        """Add sentence partitions to ``document`` and return it.

        If ``inplace`` is False, the input document is left untouched and an
        annotated copy is returned instead. A non-empty partition layer is
        cleared (with a warning) before the new sentence spans are added.
        """
        if not self.inplace:
            document = document.copy()

        partition_layer = document[self.partition_layer_name]
        if len(partition_layer) > 0:
            logger.warning(
                f"Layer {self.partition_layer_name} in document {document.id} is not empty. "
                f"Clearing it before adding new sentence partitions."
            )
            partition_layer.clear()

        text: str = getattr(document, self.text_field_name)
        # span_tokenize yields (start, end) character offsets, so the created
        # LabeledSpans index directly into the original text.
        sentence_spans = self.sentencizer.span_tokenize(text)
        sentences = [
            LabeledSpan(start=start, end=end, label="sentence") for start, end in sentence_spans
        ]
        partition_layer.extend(sentences)
        return document
class FlairSegtokSentenceSplitter:
    """A document processor that adds sentence partitions to a TextDocumentWithLabeledPartitions document.

    It uses the Flair SegtokSentenceSplitter to split the text of the document into sentences. See
    https://github.com/flairNLP/flair/blob/master/flair/splitter.py for more information.

    Args:
        partition_layer_name: The name of the partition layer to add the sentence partitions to. This layer
            must be an AnnotationLayer of LabeledSpan annotations.
        text_field_name: The name of the text field in the document to split into sentences.
        inplace: A boolean value that determines whether the sentence partitions are added to the input document
            or a new document is created.
    """

    def __init__(
        self,
        partition_layer_name: str = "labeled_partitions",
        text_field_name: str = "text",
        inplace: bool = True,
    ):
        self.sentencizer = SegtokSentenceSplitter()
        self.partition_layer_name = partition_layer_name
        self.text_field_name = text_field_name
        self.inplace = inplace

    def __call__(self, document: D) -> D:
        """Split the document text into sentences and store them as labeled partitions."""
        target = document if self.inplace else document.copy()

        layer = target[self.partition_layer_name]
        if len(layer) > 0:
            logger.warning(
                f"Layer {self.partition_layer_name} in document {target.id} is not empty. "
                f"Clearing it before adding new sentence partitions."
            )
            layer.clear()

        content: str = getattr(target, self.text_field_name)
        # Each Flair Sentence carries its start offset into the original text;
        # the end offset is derived from the length of the sentence text.
        spans = []
        for sentence in self.sentencizer.split(content):
            begin = sentence.start_position
            spans.append(
                LabeledSpan(start=begin, end=begin + len(sentence.text), label="sentence")
            )
        layer.extend(spans)
        return target