-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNLTK-Tutorial-Python2.py
125 lines (68 loc) · 3.39 KB
/
NLTK-Tutorial-Python2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# coding: utf-8
# In[ ]:
# Let's import the nltk module
import nltk
# To start, we need some text to play with. NLTK offers many corpora and other resources for exploring natural language.
# They are downloaded separately from the library: a one-off run of nltk.download() opens the downloader, from which
# you can fetch all the resources in one go. Once you've done that you will have a repository of interesting texts,
# including Moby Dick and an Inaugural Address Corpus.
from nltk.book import *
# In[ ]:
# The texts are loaded now and can be referred to by name. Each one is an object of type 'Text', which
# comes with a bunch of handy methods for exploring its contents.
# concordance() prints every occurrence of a word along with some surrounding context. We try it on two
# texts - Moby Dick and Sense and Sensibility. As expected, word usage and language in these two books
# are pretty different :)
probeWord = "monstrous"
text1.concordance(probeWord)
# In[ ]:
text2.concordance(probeWord)
# In[ ]:
# As the concordances show, Melville and Austen give 'monstrous' different connotations: he uses it to
# indicate size and things that are terrifying, she uses it with a positive connotation.
# similar() lists other words that appear in the same contexts as 'monstrous'
text2.similar(probeWord)
# In[ ]:
# Clearly Austen uses "monstrous" to express positive emotions and to amplify those emotions. She seems
# to use it interchangeably with "very", so let's look at the contexts the two words have in common.
text2.common_contexts([probeWord, "very"])
# In[ ]:
# These are fun ways to explore how natural language is used in different contexts or situations.
# First, let's see how the usage of certain words by Presidents has changed over the years.
# (matplotlib must be installed before you run the plotting lines below)
inauguralWords = ["citizens", "democracy", "freedom", "duties", "America"]
text4.dispersion_plot(inauguralWords)
# In[ ]:
# Next, let's see what kind of emotions are expressed in Jane Austen's works vs Herman Melville's.
# Each plot gets its own fresh list, exactly as if the literal had been written out twice.
moodWords = ("happy", "sad")
text2.dispersion_plot(list(moodWords))
# In[ ]:
text1.dispersion_plot(list(moodWords))
# In[ ]:
# Now for some serious stuff. Often you want to extract features from a text - attributes that
# represent the text, such as its words or sentences.
# So how do we split a piece of text into its constituent sentences/words? (These pieces are called tokens.)
from nltk.tokenize import word_tokenize, sent_tokenize
text = "Mary had a little lamb. Her fleece was white as snow"
# Step 1: split the text into sentences
sents = sent_tokenize(text)
print(sents)
# In[ ]:
# Step 2: split every sentence into words, giving one list of tokens per sentence
words = list(map(word_tokenize, sents))
print(words)
# In[ ]:
# Let's filter out stopwords (words that are very common, like 'was', 'a', 'as', etc.)
from nltk.corpus import stopwords
from string import punctuation
# The stopwords are kept in a set: the filter below performs one membership test per token, and a set
# answers each test without scanning a list. Punctuation marks are added so that they get dropped too.
customStopWords = set(stopwords.words('english')) | set(punctuation)
# NLTK's English stopword list is all lowercase, so the token is lowercased for the comparison only.
# Without that, a capitalised stopword (here the sentence-initial 'Her') would slip through the filter.
# The surviving tokens keep their original casing.
wordsWOStopwords = [word for word in word_tokenize(text) if word.lower() not in customStopWords]
print(wordsWOStopwords)
# In[ ]:
# The sample sentence gets its own name: binding it to `text2` would overwrite the text of that name
# imported from nltk.book, and the earlier cells that call text2.concordance() etc. could not be re-run.
closeText = "Mary closed on closing night when she was in the mood to close."
# 'close' appears in different morphological forms here; stemming reduces all forms of the word 'close' to a common stem.
# NLTK has multiple stemmers based on different rules/algorithms. Stemming is related to lemmatization but is not
# the same thing: a stemmer strips affixes by rule and may return a non-word, whereas a lemmatizer maps each
# word to its dictionary form.
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(closeText)]
print(stemmedWords)
# In[ ]:
# NLTK has functionality to automatically tag words as nouns, verbs, conjunctions etc.
# The tags are printed explicitly, like every other result in this file: when the file is run as a
# script, the value of a bare expression is silently discarded.
print(nltk.pos_tag(word_tokenize(closeText)))
# In[ ]:
# In[ ]:
# In[ ]: