Skip to content

Commit

Permalink
fix(parse): replace \n with space in sentence split (Issue #506)
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Nov 1, 2023
1 parent 9140a04 commit 3ccb87c
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions newspaper/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
 from os import path

 from collections import Counter
+from typing import List

 from . import settings

Expand Down Expand Up @@ -150,14 +151,23 @@ def split_words(text):
return None


-def split_sentences(text):
-    """Split a large string into sentences"""
+def split_sentences(text: str) -> List[str]:
+    """Split a large string into sentences. Uses the Punkt Sentence Tokenizer
+    from the nltk module to split strings into sentences.
+    Args:
+        text (str): input text
+    Returns:
+        List[str]: a list of sentences
+    """
     import nltk.data

     # TODO: load a language specific tokenizer
     tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

     sentences = tokenizer.tokenize(text)
-    sentences = [x.replace("\n", "") for x in sentences if len(x) > 10]
+    sentences = [re.sub("[\n ]+", " ", x) for x in sentences if len(x) > 10]
     return sentences


Expand Down

0 comments on commit 3ccb87c

Please sign in to comment.