+ {
+  "metadata": {
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.2-final"
+   },
+   "orig_nbformat": 2,
+   "kernelspec": {
+    "name": "python3",
+    "display_name": "Python 3.8.2 64-bit",
+    "metadata": {
+     "interpreter": {
+      "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+     }
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2,
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user --upgrade pip\n",
+     "!pip3 install --user nltk"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import nltk\n",
+     "nltk.download('wordnet')\n",
+     "nltk.download('punkt')\n",
+     "nltk.download('averaged_perceptron_tagger')\n",
+     "nltk.download('gutenberg')\n",
+     "nltk.download('stopwords')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "from nltk.tokenize import word_tokenize\n",
+     "from nltk.tokenize import sent_tokenize\n",
+     "\n",
+     "corpus = '''Tokenization is the process of tokenizing or splitting a string, text into a list of tokens. One can think of token as parts like a word is a token in a sentence, and a sentence is a token in a paragraph.'''\n",
+     "\n",
+     "print(sent_tokenize(corpus))\n",
+     "print(word_tokenize(corpus))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.stem import WordNetLemmatizer\n",
+     "\n",
+     "corpus = ['rocks', 'gone', 'better']\n",
+     "lemmatizer = WordNetLemmatizer()\n",
+     "\n",
+     "print([lemmatizer.lemmatize(w) for w in corpus])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk import pos_tag\n",
+     "\n",
+     "def lemmatize_sent(text):\n",
+     "    # Map Penn Treebank tag prefixes to WordNet POS codes; default to noun\n",
+     "    pos_dict = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}\n",
+     "    word_list = []\n",
+     "    for word, tag in pos_tag(word_tokenize(text)):\n",
+     "        pos = pos_dict[tag[0:2]] if tag[0:2] in pos_dict else 'n'\n",
+     "        word_list.append(lemmatizer.lemmatize(word, pos=pos))\n",
+     "    return word_list\n",
+     "\n",
+     "sentence = 'He is walking to school'\n",
+     "print('lemmatize word by word: ', [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)])\n",
+     "print('lemmatize with context: ', lemmatize_sent(sentence))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.stem import PorterStemmer\n",
+     "\n",
+     "corpus = ['rocks', 'going', 'history']\n",
+     "stemmer = PorterStemmer()\n",
+     "print([stemmer.stem(w) for w in corpus])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.corpus import gutenberg\n",
+     "import time\n",
+     "\n",
+     "def timing(func):\n",
+     "    # Decorator that reports how long the wrapped function takes, in milliseconds\n",
+     "    def decorate(*args, **kwargs):\n",
+     "        start = time.time()\n",
+     "        func(*args, **kwargs)\n",
+     "        print(\"%-30s: %-7.2f ms\" % (func.__name__, (time.time() - start) * 1000))\n",
+     "    return decorate\n",
+     "\n",
+     "@timing\n",
+     "def stemming(text):\n",
+     "    [stemmer.stem(w) for w in word_tokenize(text)]\n",
+     "\n",
+     "@timing\n",
+     "def lemmatize(text):\n",
+     "    lemmatize_sent(text)\n",
+     "\n",
+     "@timing\n",
+     "def lemmatize_without_context(text):\n",
+     "    [lemmatizer.lemmatize(w) for w in word_tokenize(text)]\n",
+     "\n",
+     "book = gutenberg.raw(\"austen-sense.txt\")\n",
+     "\n",
+     "stemming(book)\n",
+     "lemmatize(book)\n",
+     "lemmatize_without_context(book)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.corpus import stopwords\n",
+     "\n",
+     "# NLTK's stopword list is lowercase, so compare case-insensitively\n",
+     "corpus = ['I', 'am', 'a', 'boy']\n",
+     "print([w for w in corpus if w.lower() not in set(stopwords.words('english'))])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user scikit-learn"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from sklearn.feature_extraction.text import CountVectorizer\n",
+     "\n",
+     "vectorizer = CountVectorizer()\n",
+     "corpus = [\n",
+     "    'He is a teacher',\n",
+     "    'I am student',\n",
+     "    'She is also a student',\n",
+     "]\n",
+     "X = vectorizer.fit_transform(corpus)\n",
+     "\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(list(vectorizer.stop_words_))\n",
+     "print(X.toarray())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+     "\n",
+     "vectorizer = TfidfVectorizer()\n",
+     "corpus = [\n",
+     "    'He is a teacher',\n",
+     "    'I am student',\n",
+     "    'She is also a student',\n",
+     "]\n",
+     "X = vectorizer.fit_transform(corpus)\n",
+     "\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(X.toarray())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user numpy"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import numpy as np\n",
+     "\n",
+     "# Manual TF-IDF calculation for the sentence 'He is a teacher'.\n",
+     "# sklearn's smoothed idf is ln((1 + n_docs) / (1 + df)) + 1; the vector is then L2-normalized.\n",
+     "tfidf_he = 1/3 * (np.log((3+1)/(1+1)) + 1)\n",
+     "tfidf_is = 1/3 * (np.log((3+1)/(2+1)) + 1)\n",
+     "tfidf_teacher = 1/3 * (np.log((3+1)/(1+1)) + 1)\n",
+     "\n",
+     "print(np.divide([tfidf_he, tfidf_is, tfidf_teacher], np.sqrt(tfidf_he*tfidf_he + tfidf_is*tfidf_is + tfidf_teacher*tfidf_teacher)))"
+    ]
+   },
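+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# A quick cross-check sketch (assuming `vectorizer` and `X` still hold the\n",
+     "# TfidfVectorizer and its matrix from the earlier cell): the non-zero entries of\n",
+     "# the row for 'He is a teacher' should match the manual values computed above.\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(X.toarray()[0])"
+    ]
+   },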
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user gensim"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import gensim.downloader\n",
+     "from gensim.models import Word2Vec\n",
+     "\n",
+     "# Download and load the pretrained Google News word2vec vectors (large download)\n",
+     "word2vec = gensim.downloader.load('word2vec-google-news-300')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "print(word2vec.most_similar('car'))\n",
+     "print(word2vec.word_vec('car'))"
+    ]
+   },
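+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# A brief illustrative sketch (assuming the vectors loaded above): cosine similarity\n",
+     "# between two specific words via KeyedVectors.similarity; the word pairs are arbitrary examples.\n",
+     "print(word2vec.similarity('car', 'truck'))\n",
+     "print(word2vec.similarity('car', 'banana'))"
+    ]
+   },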
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install tensorflow"
+    ]
+   }
+  ]
+ }