Commit 9d01911

Add a notebook
0 parents  commit 9d01911

1 file changed: +281 -0 lines changed

Lines changed: 281 additions & 0 deletions
@@ -0,0 +1,281 @@
{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.2 64-bit",
   "metadata": {
    "interpreter": {
     "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip3 install --user --upgrade pip\n",
    "!pip3 install --user nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "nltk.download('wordnet')\n",
    "nltk.download('punkt')\n",
    "nltk.download('averaged_perceptron_tagger')\n",
    "nltk.download('gutenberg')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "corpus = '''Tokenization is the process of tokenizing or splitting a string, text into a list of tokens. One can think of token as parts like a word is a token in a sentence, and a sentence is a token in a paragraph.'''\n",
    "\n",
    "print(sent_tokenize(corpus))\n",
    "print(word_tokenize(corpus))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "corpus = ['rocks', 'gone', 'better']\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "print([lemmatizer.lemmatize(w) for w in corpus])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import pos_tag\n",
    "\n",
    "def lemmatize_sent(text):\n",
    "    pos_dict = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}\n",
    "    word_list = []\n",
    "    for word, tag in pos_tag(word_tokenize(text)):\n",
    "        pos = pos_dict[tag[0:2]] if tag[0:2] in pos_dict else 'n'\n",
    "        word_list.append(lemmatizer.lemmatize(word, pos=pos))\n",
    "    return word_list\n",
    "\n",
    "sentence = 'He is walking to school'\n",
    "print('lemmatize word by word: ', [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)])\n",
    "print('lemmatize with context: ', lemmatize_sent(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "\n",
    "corpus = ['rocks', 'going', 'history']\n",
    "stemmer = PorterStemmer()\n",
    "print([stemmer.stem(w) for w in corpus])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import gutenberg\n",
    "import time\n",
    "\n",
    "def timing(func):\n",
    "    def decorate(*args, **kwargs):\n",
    "        start = time.time()\n",
    "        func(*args, **kwargs)\n",
    "        print(\"%-30s: %-7.2f ms\" % (func.__name__, (time.time() - start) * 1000))\n",
    "    return decorate\n",
    "\n",
    "@timing\n",
    "def stemming(text):\n",
" [stemmer.stem(w) for w in word_tokenize(sentence)]\n",
137+
"\n",
138+
"@timing\n",
139+
"def lemmatize(text):\n",
140+
" lemmatize_sent(text)\n",
141+
"\n",
142+
"@timing\n",
143+
"def lemmatize_without_context(text): \n",
144+
" [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)]\n",
145+
"\n",
146+
"book = gutenberg.raw(\"austen-sense.txt\")\n",
147+
"\n",
148+
"stemming(book)\n",
149+
"lemmatize(book)\n",
150+
"lemmatize_without_context(book)"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"from nltk.corpus import stopwords\n",
160+
"\n",
161+
"corpus = ['I', 'am', 'a', 'boy']\n",
162+
"print([w for w in corpus if w not in set(stopwords.words('english'))])"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"!pip3 install --user scikit-learn"
172+
]
173+
},
174+
{
175+
"cell_type": "code",
176+
"execution_count": null,
177+
"metadata": {},
178+
"outputs": [],
179+
"source": [
180+
"from sklearn.feature_extraction.text import CountVectorizer\n",
181+
"\n",
182+
"vectorizer = CountVectorizer()\n",
183+
"corpus = [\n",
184+
" 'He is a teacher',\n",
185+
" 'I am student',\n",
186+
" 'She is also a student',\n",
187+
"]\n",
188+
"X = vectorizer.fit_transform(corpus)\n",
189+
"\n",
190+
"print(vectorizer.get_feature_names())\n",
191+
"print(list(vectorizer.stop_words_))\n",
192+
"print(X.toarray())"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": null,
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
202+
"\n",
203+
"vectorizer = TfidfVectorizer()\n",
204+
"corpus = [\n",
205+
" 'He is a teacher',\n",
206+
" 'I am student',\n",
207+
" 'She is also a student',\n",
208+
"]\n",
209+
"X = vectorizer.fit_transform(corpus)\n",
210+
"\n",
211+
"print(vectorizer.get_feature_names())\n",
212+
"print(X.toarray())"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": null,
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
221+
"!pip3 install --user numpy"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": null,
227+
"metadata": {},
228+
"outputs": [],
229+
"source": [
230+
"import numpy as np\n",
231+
"\n",
232+
"#The vector calculation of the sentence He is a teacher.\n",
233+
"tfidf_he = 1/3 * (np.log((3+1)/(1+1))+1)\n",
234+
"tfidf_is = 1/3 * (np.log((3+1)/(2+1))+1)\n",
235+
"tfidf_teacher = 1/3 * (np.log((3+1)/(1+1))+1)\n",
236+
"\n",
237+
"print(np.divide([tfidf_he, tfidf_is, tfidf_teacher], np.sqrt(tfidf_he*tfidf_he + tfidf_is*tfidf_is + tfidf_teacher* tfidf_teacher)))"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": null,
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
246+
"!pip3 install --user gensim"
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": null,
252+
"metadata": {},
253+
"outputs": [],
254+
"source": [
255+
"import gensim.downloader\n",
256+
"from gensim.models import Word2Vec\n",
257+
"\n",
258+
"word2vec = gensim.downloader.load('word2vec-google-news-300')"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": null,
264+
"metadata": {},
265+
"outputs": [],
266+
"source": [
267+
"print(word2vec.most_similar('car'))\n",
268+
"print(word2vec.word_vec('car'))\n"
269+
]
270+
},
271+
{
272+
"cell_type": "code",
273+
"execution_count": null,
274+
"metadata": {},
275+
"outputs": [],
276+
"source": [
277+
"!pip3 install tensorflow"
278+
]
279+
}
280+
]
281+
}
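
A quick sanity check on the hand-computed TF-IDF cell above (not part of the committed notebook, just a sketch assuming scikit-learn and numpy are installed, as in the earlier pip cells): with scikit-learn's defaults the idf is ln((1+n)/(1+df)) + 1 and each row is L2-normalised, so the three manual values for 'he', 'is' and 'teacher' should match the non-zero entries of the first row produced by TfidfVectorizer.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy corpus as in the notebook.
corpus = ['He is a teacher', 'I am student', 'She is also a student']
X = TfidfVectorizer().fit_transform(corpus)

# Manual weights for 'he' (df=1), 'is' (df=2), 'teacher' (df=1), then L2-normalise.
manual = np.array([
    1/3 * (np.log((3 + 1) / (1 + 1)) + 1),
    1/3 * (np.log((3 + 1) / (2 + 1)) + 1),
    1/3 * (np.log((3 + 1) / (1 + 1)) + 1),
])
manual /= np.linalg.norm(manual)

row = X.toarray()[0]
print(manual)        # hand-computed vector for 'He is a teacher'
print(row[row > 0])  # matching non-zero weights from TfidfVectorizer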
