-
Notifications
You must be signed in to change notification settings - Fork 0
/
word-and-document-frequency-tf-idf.html
424 lines (382 loc) · 31.8 KB
/
word-and-document-frequency-tf-idf.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
<!DOCTYPE html>
<html lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 7 Word and document frequency (TF-IDF) | Natural Language Processing with R</title>
<meta name="description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="generator" content="bookdown 0.18 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 7 Word and document frequency (TF-IDF) | Natural Language Processing with R" />
<meta property="og:type" content="book" />
<meta property="og:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 7 Word and document frequency (TF-IDF) | Natural Language Processing with R" />
<meta name="twitter:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="author" content="Saif SHabou" />
<meta name="date" content="2020-05-06" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<link rel="prev" href="sentiment-analysis.html"/>
<link rel="next" href="topic-modeling.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<style type="text/css">
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource a.sourceLine
{ position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
{ content: attr(title);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><a href="./">NLP with R</a></li>
<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Introduction</a></li>
<li class="chapter" data-level="2" data-path="text-processing.html"><a href="text-processing.html"><i class="fa fa-check"></i><b>2</b> Text processing</a><ul>
<li class="chapter" data-level="2.1" data-path="text-processing.html"><a href="text-processing.html#text-data"><i class="fa fa-check"></i><b>2.1</b> Text data</a></li>
<li class="chapter" data-level="2.2" data-path="text-processing.html"><a href="text-processing.html#nlp-applications"><i class="fa fa-check"></i><b>2.2</b> NLP applications</a></li>
<li class="chapter" data-level="2.3" data-path="text-processing.html"><a href="text-processing.html#tokenization"><i class="fa fa-check"></i><b>2.3</b> Tokenization</a></li>
<li class="chapter" data-level="2.4" data-path="text-processing.html"><a href="text-processing.html#stop-words-handeling"><i class="fa fa-check"></i><b>2.4</b> Stop words handeling</a></li>
<li class="chapter" data-level="2.5" data-path="text-processing.html"><a href="text-processing.html#words-frequencies"><i class="fa fa-check"></i><b>2.5</b> Words frequencies</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="Word-embeddings.html"><a href="Word-embeddings.html"><i class="fa fa-check"></i><b>3</b> Word embeddings</a><ul>
<li class="chapter" data-level="3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#vectorizing-text"><i class="fa fa-check"></i><b>3.1</b> Vectorizing text</a></li>
<li class="chapter" data-level="3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#one-hot-encoding"><i class="fa fa-check"></i><b>3.2</b> One-hot encoding</a></li>
<li class="chapter" data-level="3.3" data-path="Word-embeddings.html"><a href="Word-embeddings.html#word-embeddings-methods"><i class="fa fa-check"></i><b>3.3</b> Word embeddings methods</a><ul>
<li class="chapter" data-level="3.3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#learn-world-embeddings"><i class="fa fa-check"></i><b>3.3.1</b> Learn world embeddings</a></li>
<li class="chapter" data-level="3.3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#pre-trained-word-embeddings"><i class="fa fa-check"></i><b>3.3.2</b> Pre-trained word embeddings</a></li>
</ul></li>
<li class="chapter" data-level="3.4" data-path="Word-embeddings.html"><a href="Word-embeddings.html#applications"><i class="fa fa-check"></i><b>3.4</b> Applications</a><ul>
<li class="chapter" data-level="3.4.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-skip-gram"><i class="fa fa-check"></i><b>3.4.1</b> Using Skip-Gram</a></li>
<li class="chapter" data-level="3.4.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-glove"><i class="fa fa-check"></i><b>3.4.2</b> Using GloVe</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="Word-embeddings.html"><a href="Word-embeddings.html#references"><i class="fa fa-check"></i><b>3.5</b> references</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="text-classification.html"><a href="text-classification.html"><i class="fa fa-check"></i><b>4</b> Text classification</a><ul>
<li class="chapter" data-level="4.1" data-path="text-classification.html"><a href="text-classification.html#load-the-data"><i class="fa fa-check"></i><b>4.1</b> Load the data</a></li>
<li class="chapter" data-level="4.2" data-path="text-classification.html"><a href="text-classification.html#prepare-the-data-for-neural-network"><i class="fa fa-check"></i><b>4.2</b> Prepare the data for neural network</a></li>
<li class="chapter" data-level="4.3" data-path="text-classification.html"><a href="text-classification.html#building-the-model"><i class="fa fa-check"></i><b>4.3</b> Building the model</a></li>
<li class="chapter" data-level="4.4" data-path="text-classification.html"><a href="text-classification.html#testing-the-model"><i class="fa fa-check"></i><b>4.4</b> Testing the model</a></li>
<li class="chapter" data-level="4.5" data-path="text-classification.html"><a href="text-classification.html#reference"><i class="fa fa-check"></i><b>4.5</b> Reference</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="RNN.html"><a href="RNN.html"><i class="fa fa-check"></i><b>5</b> Reccurent Neural Networks (RNN)</a><ul>
<li class="chapter" data-level="5.1" data-path="RNN.html"><a href="RNN.html#understanding-recurrent-neural-network"><i class="fa fa-check"></i><b>5.1</b> Understanding Recurrent Neural Network</a></li>
<li class="chapter" data-level="5.2" data-path="RNN.html"><a href="RNN.html#rnn-with-keras"><i class="fa fa-check"></i><b>5.2</b> RNN with Keras</a></li>
<li class="chapter" data-level="5.3" data-path="RNN.html"><a href="RNN.html#lstm-with-keras"><i class="fa fa-check"></i><b>5.3</b> LSTM with Keras</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html"><i class="fa fa-check"></i><b>6</b> Sentiment Analysis</a><ul>
<li class="chapter" data-level="6.1" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#the-sentiments-dataset"><i class="fa fa-check"></i><b>6.1</b> The “Sentiments” dataset</a></li>
<li class="chapter" data-level="6.2" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#application"><i class="fa fa-check"></i><b>6.2</b> Application</a></li>
<li class="chapter" data-level="6.3" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#references-1"><i class="fa fa-check"></i><b>6.3</b> References:</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html"><i class="fa fa-check"></i><b>7</b> Word and document frequency (TF-IDF)</a><ul>
<li class="chapter" data-level="7.1" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#term-frequency-application"><i class="fa fa-check"></i><b>7.1</b> Term frequency application</a></li>
<li class="chapter" data-level="7.2" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#zipfs-law"><i class="fa fa-check"></i><b>7.2</b> Zipf’s law</a></li>
<li class="chapter" data-level="7.3" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#tf_idf-metric"><i class="fa fa-check"></i><b>7.3</b> TF_IDF metric</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="topic-modeling.html"><a href="topic-modeling.html"><i class="fa fa-check"></i><b>8</b> Topic modeling</a><ul>
<li class="chapter" data-level="8.1" data-path="topic-modeling.html"><a href="topic-modeling.html#latent-dirichlet-allocation"><i class="fa fa-check"></i><b>8.1</b> Latent Dirichlet allocation</a></li>
<li class="chapter" data-level="8.2" data-path="topic-modeling.html"><a href="topic-modeling.html#document-topic-probabilities"><i class="fa fa-check"></i><b>8.2</b> Document-topic probabilities</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html"><i class="fa fa-check"></i><b>9</b> Words’ relationships analysis</a><ul>
<li class="chapter" data-level="9.1" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#extracting-bi-grams"><i class="fa fa-check"></i><b>9.1</b> Extracting bi-grams</a></li>
<li class="chapter" data-level="9.2" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#analyzing-bi-grams"><i class="fa fa-check"></i><b>9.2</b> Analyzing bi-grams</a></li>
<li class="chapter" data-level="9.3" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#visualizing-a-network-of-bigrams"><i class="fa fa-check"></i><b>9.3</b> Visualizing a network of bigrams</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="document-term-matrix.html"><a href="document-term-matrix.html"><i class="fa fa-check"></i><b>10</b> Document-term matrix</a><ul>
<li class="chapter" data-level="10.1" data-path="document-term-matrix.html"><a href="document-term-matrix.html#converting-dtm-into-dataframe"><i class="fa fa-check"></i><b>10.1</b> COnverting DTM into dataframe</a></li>
<li class="chapter" data-level="10.2" data-path="document-term-matrix.html"><a href="document-term-matrix.html#generating-document-term-matrix"><i class="fa fa-check"></i><b>10.2</b> Generating Document-term matrix</a></li>
</ul></li>
<li class="divider"></li>
<li><a href="https://github.com/rstudio/bookdown" target="blank">Published with bookdown</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Natural Language Processing with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="word-and-document-frequency-tf-idf" class="section level1">
<h1><span class="header-section-number">Chapter 7</span> Word and document frequency (TF-IDF)</h1>
<p>One major question in text mining and natural langiage procesing is to quantify what a document is about using the words it contains. In addition to measuring “term frequency” metric (tf), we can look at the term’s <em>inverse document frequency</em> (idf). The <em>idf</em> decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. This metric can be combined with the term frequency to claculate a term’s <em>tf-idf</em>: the frequency of a term adjusted for how rarely is is used.</p>
<p><span class="math inline">\(idf(term) = ln (\frac{n_{documents}}{n_{documents containing term}})\)</span></p>
<div id="term-frequency-application" class="section level2">
<h2><span class="header-section-number">7.1</span> Term frequency application</h2>
<p>Let’s count the term frequency in Jane Austen’s novels</p>
<div class="sourceCode" id="cb126"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb126-1" title="1"><span class="kw">library</span>(dplyr)</a>
<a class="sourceLine" id="cb126-2" title="2"><span class="kw">library</span>(janeaustenr)</a>
<a class="sourceLine" id="cb126-3" title="3"><span class="kw">library</span>(tidytext)</a>
<a class="sourceLine" id="cb126-4" title="4"></a>
<a class="sourceLine" id="cb126-5" title="5"><span class="co"># count term frequency in each book</span></a>
<a class="sourceLine" id="cb126-6" title="6">book_words =<span class="st"> </span><span class="kw">austen_books</span>() <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb126-7" title="7"><span class="st"> </span><span class="kw">unnest_tokens</span>(word, text) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb126-8" title="8"><span class="st"> </span><span class="kw">count</span>(book, word, <span class="dt">sort =</span> <span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb126-9" title="9"></a>
<a class="sourceLine" id="cb126-10" title="10"><span class="co"># count number of terms in each book</span></a>
<a class="sourceLine" id="cb126-11" title="11">total_words =<span class="st"> </span>book_words <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb126-12" title="12"><span class="st"> </span><span class="kw">group_by</span>(book) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb126-13" title="13"><span class="st"> </span><span class="kw">summarize</span>(<span class="dt">total =</span> <span class="kw">sum</span>(n))</a>
<a class="sourceLine" id="cb126-14" title="14"><span class="co">#join both</span></a>
<a class="sourceLine" id="cb126-15" title="15">book_words =<span class="st"> </span><span class="kw">left_join</span>(book_words, total_words)</a></code></pre></div>
<pre><code>## Joining, by = "book"</code></pre>
<div class="sourceCode" id="cb128"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb128-1" title="1">book_words</a></code></pre></div>
<pre><code>## # A tibble: 40,379 x 4
## book word n total
## <fct> <chr> <int> <int>
## 1 Mansfield Park the 6206 160460
## 2 Mansfield Park to 5475 160460
## 3 Mansfield Park and 5438 160460
## 4 Emma to 5239 160996
## 5 Emma the 5201 160996
## 6 Emma and 4896 160996
## 7 Mansfield Park of 4778 160460
## 8 Pride & Prejudice the 4331 122204
## 9 Emma of 4291 160996
## 10 Pride & Prejudice to 4162 122204
## # ... with 40,369 more rows</code></pre>
<p>The resulting table contains one word/book by row: <code>n</code> is the number of times the word is used in a specific book and <code>total</code> is the toal words in the book. Let’s look at the distribution of <code>n/total</code> for each novel. It represents the number of times a word appears iin a novel diveded by the ttal number of terms: the term frequency.</p>
<div class="sourceCode" id="cb130"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb130-1" title="1"><span class="kw">library</span>(ggplot2)</a>
<a class="sourceLine" id="cb130-2" title="2"></a>
<a class="sourceLine" id="cb130-3" title="3"><span class="kw">ggplot</span>(<span class="dt">data =</span> book_words, <span class="kw">aes</span>(n<span class="op">/</span>total, <span class="dt">fill =</span> book)) <span class="op">+</span></a>
<a class="sourceLine" id="cb130-4" title="4"><span class="st"> </span><span class="kw">geom_histogram</span>(<span class="dt">show.legend =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb130-5" title="5"><span class="st"> </span><span class="kw">xlim</span>(<span class="ot">NA</span>, <span class="fl">0.0009</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb130-6" title="6"><span class="st"> </span><span class="kw">facet_wrap</span>(<span class="op">~</span>book, <span class="dt">ncol =</span> <span class="dv">2</span>, <span class="dt">scales =</span> <span class="st">"free_y"</span>)</a></code></pre></div>
<pre><code>## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.</code></pre>
<pre><code>## Warning: Removed 896 rows containing non-finite values (stat_bin).</code></pre>
<pre><code>## Warning: Removed 6 rows containing missing values (geom_bar).</code></pre>
<p><img src="NLP-book_files/figure-html/unnamed-chunk-12-1.png" width="672" /></p>
</div>
<div id="zipfs-law" class="section level2">
<h2><span class="header-section-number">7.2</span> Zipf’s law</h2>
<p>The distribution shon in the previous figure is typical in known as Zipf’s law. It represents the relationships between the frequency of a word and its rank. Zipf’s law states that the frequency that a word appears is inversly proportional to its rank.</p>
<p>We can test this hypothesis with Jane Auste’s novels:</p>
<div class="sourceCode" id="cb134"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb134-1" title="1">freq_by_rank =<span class="st"> </span>book_words <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb134-2" title="2"><span class="st"> </span><span class="kw">group_by</span>(book) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb134-3" title="3"><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">rank =</span> <span class="kw">row_number</span>(),</a>
<a class="sourceLine" id="cb134-4" title="4"> <span class="st">`</span><span class="dt">term frequency</span><span class="st">`</span> =<span class="st"> </span>n<span class="op">/</span>total)</a>
<a class="sourceLine" id="cb134-5" title="5">freq_by_rank</a></code></pre></div>
<pre><code>## # A tibble: 40,379 x 6
## # Groups: book [6]
## book word n total rank `term frequency`
## <fct> <chr> <int> <int> <int> <dbl>
## 1 Mansfield Park the 6206 160460 1 0.0387
## 2 Mansfield Park to 5475 160460 2 0.0341
## 3 Mansfield Park and 5438 160460 3 0.0339
## 4 Emma to 5239 160996 1 0.0325
## 5 Emma the 5201 160996 2 0.0323
## 6 Emma and 4896 160996 3 0.0304
## 7 Mansfield Park of 4778 160460 4 0.0298
## 8 Pride & Prejudice the 4331 122204 1 0.0354
## 9 Emma of 4291 160996 4 0.0267
## 10 Pride & Prejudice to 4162 122204 2 0.0341
## # ... with 40,369 more rows</code></pre>
<p>In the ibtained dataframe, the <code>rank</code> column represents the rank of rach word within the frequency table (ordered by <code>n</code>). We can visualize the zipf’s law by plotting the rank in the x-axis and term frequency on the y-axis
, on logarithmic scales.</p>
<div class="sourceCode" id="cb136"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb136-1" title="1">freq_by_rank <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb136-2" title="2"><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(rank, <span class="st">`</span><span class="dt">term frequency</span><span class="st">`</span>, <span class="dt">color =</span> book)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb136-3" title="3"><span class="st"> </span><span class="kw">geom_line</span>(<span class="dt">size =</span> <span class="fl">1.1</span>, <span class="dt">alpha =</span> <span class="fl">0.8</span>, <span class="dt">show.legend =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb136-4" title="4"><span class="st"> </span><span class="kw">scale_x_log10</span>() <span class="op">+</span></a>
<a class="sourceLine" id="cb136-5" title="5"><span class="st"> </span><span class="kw">scale_y_log10</span>()</a></code></pre></div>
<p><img src="NLP-book_files/figure-html/unnamed-chunk-14-1.png" width="672" /></p>
</div>
<div id="tf_idf-metric" class="section level2">
<h2><span class="header-section-number">7.3</span> TF_IDF metric</h2>
<p>TF-IDF (Term frequency-inverse document frequency) is a method for evaluating how important a ord is to a document in a collection or corpus. It consists of decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a corpus of documents.</p>
<p><span class="math inline">\(w_{term,document} = tf_{term,document} log(\frac{total number of documents}{number of documents containing the term})\)</span></p>
<p>Here is an example of measuring Tf-IDF using the <code>bind_tf_idf</code> function</p>
<div class="sourceCode" id="cb137"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb137-1" title="1">book_words <-<span class="st"> </span>book_words <span class="op">%>%</span></a>
<a class="sourceLine" id="cb137-2" title="2"><span class="st"> </span><span class="kw">bind_tf_idf</span>(word, book, n)</a>
<a class="sourceLine" id="cb137-3" title="3"></a>
<a class="sourceLine" id="cb137-4" title="4">book_words</a></code></pre></div>
<pre><code>## # A tibble: 40,379 x 7
## book word n total tf idf tf_idf
## <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Mansfield Park the 6206 160460 0.0387 0 0
## 2 Mansfield Park to 5475 160460 0.0341 0 0
## 3 Mansfield Park and 5438 160460 0.0339 0 0
## 4 Emma to 5239 160996 0.0325 0 0
## 5 Emma the 5201 160996 0.0323 0 0
## 6 Emma and 4896 160996 0.0304 0 0
## 7 Mansfield Park of 4778 160460 0.0298 0 0
## 8 Pride & Prejudice the 4331 122204 0.0354 0 0
## 9 Emma of 4291 160996 0.0267 0 0
## 10 Pride & Prejudice to 4162 122204 0.0341 0 0
## # ... with 40,369 more rows</code></pre>
<p>We notice that <code>idf</code> and <code>tf-idf</code> scores of commmon words are equivalent to zero since this aproach decreases the weight for common words. The inverse document frequency will be a higher number for words that occur in fewer of the documents in the collection. Let’s look at terms with high tf-idf in Jane Austen’s works</p>
<div class="sourceCode" id="cb139"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb139-1" title="1">book_words <span class="op">%>%</span></a>
<a class="sourceLine" id="cb139-2" title="2"><span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>total) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb139-3" title="3"><span class="st"> </span><span class="kw">arrange</span>(<span class="kw">desc</span>(tf_idf))</a></code></pre></div>
<pre><code>## # A tibble: 40,379 x 6
## book word n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility elinor 623 0.00519 1.79 0.00931
## 2 Sense & Sensibility marianne 492 0.00410 1.79 0.00735
## 3 Mansfield Park crawford 493 0.00307 1.79 0.00551
## 4 Pride & Prejudice darcy 373 0.00305 1.79 0.00547
## 5 Persuasion elliot 254 0.00304 1.79 0.00544
## 6 Emma emma 786 0.00488 1.10 0.00536
## 7 Northanger Abbey tilney 196 0.00252 1.79 0.00452
## 8 Emma weston 389 0.00242 1.79 0.00433
## 9 Pride & Prejudice bennet 294 0.00241 1.79 0.00431
## 10 Persuasion wentworth 191 0.00228 1.79 0.00409
## # ... with 40,369 more rows</code></pre>
<p>Here we see all proper nouns, names that are in fact important in these novels. None of them occur in all of novels, and they are important, characteristic words for each text within the corpus of Jane Austen’s novels.</p>
<p>Let’s plot the results:</p>
<div class="sourceCode" id="cb141"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb141-1" title="1">book_words <span class="op">%>%</span></a>
<a class="sourceLine" id="cb141-2" title="2"><span class="st"> </span><span class="kw">arrange</span>(<span class="kw">desc</span>(tf_idf)) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb141-3" title="3"><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">word =</span> <span class="kw">factor</span>(word, <span class="dt">levels =</span> <span class="kw">rev</span>(<span class="kw">unique</span>(word)))) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb141-4" title="4"><span class="st"> </span><span class="kw">group_by</span>(book) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb141-5" title="5"><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">15</span>) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb141-6" title="6"><span class="st"> </span><span class="kw">ungroup</span>() <span class="op">%>%</span></a>
<a class="sourceLine" id="cb141-7" title="7"><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(word, tf_idf, <span class="dt">fill =</span> book)) <span class="op">+</span></a>
<a class="sourceLine" id="cb141-8" title="8"><span class="st"> </span><span class="kw">geom_col</span>(<span class="dt">show.legend =</span> <span class="ot">FALSE</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb141-9" title="9"><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="ot">NULL</span>, <span class="dt">y =</span> <span class="st">"tf-idf"</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb141-10" title="10"><span class="st"> </span><span class="kw">facet_wrap</span>(<span class="op">~</span>book, <span class="dt">ncol =</span> <span class="dv">2</span>, <span class="dt">scales =</span> <span class="st">"free"</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb141-11" title="11"><span class="st"> </span><span class="kw">coord_flip</span>()</a></code></pre></div>
<pre><code>## Selecting by tf_idf</code></pre>
<p><img src="NLP-book_files/figure-html/unnamed-chunk-17-1.png" width="672" /></p>
</div>
</div>
</section>
</div>
</div>
</div>
<a href="sentiment-analysis.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="topic-modeling.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
"sharing": {
"github": false,
"facebook": true,
"twitter": true,
"linkedin": false,
"weibo": false,
"instapaper": false,
"vk": false,
"all": ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
},
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
"edit": {
"link": null,
"text": null
},
"history": {
"link": null,
"text": null
},
"view": {
"link": null,
"text": null
},
"download": ["NLP-book.pdf", "NLP-book.epub"],
"toc": {
"collapse": "subsection"
}
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
var src = "true";
if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
if (location.protocol !== "file:")
if (/^https?:/.test(src))
src = src.replace(/^https?:/, '');
script.src = src;
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>