-
Notifications
You must be signed in to change notification settings - Fork 0
/
Word-embeddings.html
843 lines (801 loc) · 76.7 KB
/
Word-embeddings.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
<!DOCTYPE html>
<html lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 3 Word embeddings | Natural Language Processing with R</title>
<meta name="description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="generator" content="bookdown 0.18 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 3 Word embeddings | Natural Language Processing with R" />
<meta property="og:type" content="book" />
<meta property="og:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 3 Word embeddings | Natural Language Processing with R" />
<meta name="twitter:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
<meta name="author" content="Saif SHabou" />
<meta name="date" content="2020-05-06" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<link rel="prev" href="text-processing.html"/>
<link rel="next" href="text-classification.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<style type="text/css">
  /* ---- Source-code listing layout ---------------------------------- */

  /* Every listing line is an anchor rendered as inert, inherited-colour text. */
  a.sourceLine {
    display: inline-block;
    line-height: 1.25;
    pointer-events: none;
    color: inherit;
    text-decoration: inherit;
  }

  /* Keep blank listing lines from collapsing. */
  a.sourceLine:empty {
    height: 1.2em;
  }

  .sourceCode {
    overflow: visible;
  }

  code.sourceCode {
    white-space: pre;
    position: relative;
  }

  pre.sourceCode {
    margin: 0;
  }

  /* On screen, listing containers scroll instead of overflowing. */
  @media screen {
    div.sourceCode {
      overflow: auto;
    }
  }

  /* In print, long lines wrap with a hanging indent. */
  @media print {
    code.sourceCode {
      white-space: pre-wrap;
    }

    a.sourceLine {
      text-indent: -1em;
      padding-left: 1em;
    }
  }

  /* ---- Numbered listings ------------------------------------------- */

  pre.numberSource a.sourceLine {
    position: relative;
    left: -4em;
  }

  /* The line number is taken from the anchor's title attribute and is not selectable. */
  pre.numberSource a.sourceLine::before {
    content: attr(title);
    position: relative;
    left: -1em;
    text-align: right;
    vertical-align: baseline;
    border: none;
    pointer-events: all;
    display: inline-block;
    -webkit-touch-callout: none;
    -webkit-user-select: none;
    -khtml-user-select: none;
    -moz-user-select: none;
    -ms-user-select: none;
    user-select: none;
    padding: 0 4px;
    width: 4em;
    color: #aaaaaa;
  }

  pre.numberSource {
    margin-left: 3em;
    border-left: 1px solid #aaaaaa;
    padding-left: 4px;
  }

  /* Intentionally empty rule, kept from the original stylesheet. */
  div.sourceCode {
  }

  @media screen {
    a.sourceLine::before {
      text-decoration: underline;
    }
  }

  /* ---- Syntax-highlighting token colours --------------------------- */

  /* Alert */
  code span.al {
    color: #ff0000;
    font-weight: bold;
  }

  /* Annotation */
  code span.an {
    color: #60a0b0;
    font-weight: bold;
    font-style: italic;
  }

  /* Attribute */
  code span.at {
    color: #7d9029;
  }

  /* BaseN */
  code span.bn {
    color: #40a070;
  }

  /* BuiltIn */
  code span.bu {
  }

  /* ControlFlow */
  code span.cf {
    color: #007020;
    font-weight: bold;
  }

  /* Char */
  code span.ch {
    color: #4070a0;
  }

  /* Constant */
  code span.cn {
    color: #880000;
  }

  /* Comment */
  code span.co {
    color: #60a0b0;
    font-style: italic;
  }

  /* CommentVar */
  code span.cv {
    color: #60a0b0;
    font-weight: bold;
    font-style: italic;
  }

  /* Documentation */
  code span.do {
    color: #ba2121;
    font-style: italic;
  }

  /* DataType */
  code span.dt {
    color: #902000;
  }

  /* DecVal */
  code span.dv {
    color: #40a070;
  }

  /* Error */
  code span.er {
    color: #ff0000;
    font-weight: bold;
  }

  /* Extension */
  code span.ex {
  }

  /* Float */
  code span.fl {
    color: #40a070;
  }

  /* Function */
  code span.fu {
    color: #06287e;
  }

  /* Import */
  code span.im {
  }

  /* Information */
  code span.in {
    color: #60a0b0;
    font-weight: bold;
    font-style: italic;
  }

  /* Keyword */
  code span.kw {
    color: #007020;
    font-weight: bold;
  }

  /* Operator */
  code span.op {
    color: #666666;
  }

  /* Other */
  code span.ot {
    color: #007020;
  }

  /* Preprocessor */
  code span.pp {
    color: #bc7a00;
  }

  /* SpecialChar */
  code span.sc {
    color: #4070a0;
  }

  /* SpecialString */
  code span.ss {
    color: #bb6688;
  }

  /* String */
  code span.st {
    color: #4070a0;
  }

  /* Variable */
  code span.va {
    color: #19177c;
  }

  /* VerbatimString */
  code span.vs {
    color: #4070a0;
  }

  /* Warning */
  code span.wa {
    color: #60a0b0;
    font-weight: bold;
    font-style: italic;
  }
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><a href="./">NLP with R</a></li>
<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Introduction</a></li>
<li class="chapter" data-level="2" data-path="text-processing.html"><a href="text-processing.html"><i class="fa fa-check"></i><b>2</b> Text processing</a><ul>
<li class="chapter" data-level="2.1" data-path="text-processing.html"><a href="text-processing.html#text-data"><i class="fa fa-check"></i><b>2.1</b> Text data</a></li>
<li class="chapter" data-level="2.2" data-path="text-processing.html"><a href="text-processing.html#nlp-applications"><i class="fa fa-check"></i><b>2.2</b> NLP applications</a></li>
<li class="chapter" data-level="2.3" data-path="text-processing.html"><a href="text-processing.html#tokenization"><i class="fa fa-check"></i><b>2.3</b> Tokenization</a></li>
<li class="chapter" data-level="2.4" data-path="text-processing.html"><a href="text-processing.html#stop-words-handeling"><i class="fa fa-check"></i><b>2.4</b> Stop words handling</a></li>
<li class="chapter" data-level="2.5" data-path="text-processing.html"><a href="text-processing.html#words-frequencies"><i class="fa fa-check"></i><b>2.5</b> Words frequencies</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="Word-embeddings.html"><a href="Word-embeddings.html"><i class="fa fa-check"></i><b>3</b> Word embeddings</a><ul>
<li class="chapter" data-level="3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#vectorizing-text"><i class="fa fa-check"></i><b>3.1</b> Vectorizing text</a></li>
<li class="chapter" data-level="3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#one-hot-encoding"><i class="fa fa-check"></i><b>3.2</b> One-hot encoding</a></li>
<li class="chapter" data-level="3.3" data-path="Word-embeddings.html"><a href="Word-embeddings.html#word-embeddings-methods"><i class="fa fa-check"></i><b>3.3</b> Word embeddings methods</a><ul>
<li class="chapter" data-level="3.3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#learn-world-embeddings"><i class="fa fa-check"></i><b>3.3.1</b> Learn word embeddings</a></li>
<li class="chapter" data-level="3.3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#pre-trained-word-embeddings"><i class="fa fa-check"></i><b>3.3.2</b> Pre-trained word embeddings</a></li>
</ul></li>
<li class="chapter" data-level="3.4" data-path="Word-embeddings.html"><a href="Word-embeddings.html#applications"><i class="fa fa-check"></i><b>3.4</b> Applications</a><ul>
<li class="chapter" data-level="3.4.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-skip-gram"><i class="fa fa-check"></i><b>3.4.1</b> Using Skip-Gram</a></li>
<li class="chapter" data-level="3.4.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-glove"><i class="fa fa-check"></i><b>3.4.2</b> Using GloVe</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="Word-embeddings.html"><a href="Word-embeddings.html#references"><i class="fa fa-check"></i><b>3.5</b> references</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="text-classification.html"><a href="text-classification.html"><i class="fa fa-check"></i><b>4</b> Text classification</a><ul>
<li class="chapter" data-level="4.1" data-path="text-classification.html"><a href="text-classification.html#load-the-data"><i class="fa fa-check"></i><b>4.1</b> Load the data</a></li>
<li class="chapter" data-level="4.2" data-path="text-classification.html"><a href="text-classification.html#prepare-the-data-for-neural-network"><i class="fa fa-check"></i><b>4.2</b> Prepare the data for neural network</a></li>
<li class="chapter" data-level="4.3" data-path="text-classification.html"><a href="text-classification.html#building-the-model"><i class="fa fa-check"></i><b>4.3</b> Building the model</a></li>
<li class="chapter" data-level="4.4" data-path="text-classification.html"><a href="text-classification.html#testing-the-model"><i class="fa fa-check"></i><b>4.4</b> Testing the model</a></li>
<li class="chapter" data-level="4.5" data-path="text-classification.html"><a href="text-classification.html#reference"><i class="fa fa-check"></i><b>4.5</b> Reference</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="RNN.html"><a href="RNN.html"><i class="fa fa-check"></i><b>5</b> Recurrent Neural Networks (RNN)</a><ul>
<li class="chapter" data-level="5.1" data-path="RNN.html"><a href="RNN.html#understanding-recurrent-neural-network"><i class="fa fa-check"></i><b>5.1</b> Understanding Recurrent Neural Network</a></li>
<li class="chapter" data-level="5.2" data-path="RNN.html"><a href="RNN.html#rnn-with-keras"><i class="fa fa-check"></i><b>5.2</b> RNN with Keras</a></li>
<li class="chapter" data-level="5.3" data-path="RNN.html"><a href="RNN.html#lstm-with-keras"><i class="fa fa-check"></i><b>5.3</b> LSTM with Keras</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html"><i class="fa fa-check"></i><b>6</b> Sentiment Analysis</a><ul>
<li class="chapter" data-level="6.1" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#the-sentiments-dataset"><i class="fa fa-check"></i><b>6.1</b> The “Sentiments” dataset</a></li>
<li class="chapter" data-level="6.2" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#application"><i class="fa fa-check"></i><b>6.2</b> Application</a></li>
<li class="chapter" data-level="6.3" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#references-1"><i class="fa fa-check"></i><b>6.3</b> References:</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html"><i class="fa fa-check"></i><b>7</b> Word and document frequency (TF-IDF)</a><ul>
<li class="chapter" data-level="7.1" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#term-frequency-application"><i class="fa fa-check"></i><b>7.1</b> Term frequency application</a></li>
<li class="chapter" data-level="7.2" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#zipfs-law"><i class="fa fa-check"></i><b>7.2</b> Zipf’s law</a></li>
<li class="chapter" data-level="7.3" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#tf_idf-metric"><i class="fa fa-check"></i><b>7.3</b> TF_IDF metric</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="topic-modeling.html"><a href="topic-modeling.html"><i class="fa fa-check"></i><b>8</b> Topic modeling</a><ul>
<li class="chapter" data-level="8.1" data-path="topic-modeling.html"><a href="topic-modeling.html#latent-dirichlet-allocation"><i class="fa fa-check"></i><b>8.1</b> Latent Dirichlet allocation</a></li>
<li class="chapter" data-level="8.2" data-path="topic-modeling.html"><a href="topic-modeling.html#document-topic-probabilities"><i class="fa fa-check"></i><b>8.2</b> Document-topic probabilities</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html"><i class="fa fa-check"></i><b>9</b> Words’ relationships analysis</a><ul>
<li class="chapter" data-level="9.1" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#extracting-bi-grams"><i class="fa fa-check"></i><b>9.1</b> Extracting bi-grams</a></li>
<li class="chapter" data-level="9.2" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#analyzing-bi-grams"><i class="fa fa-check"></i><b>9.2</b> Analyzing bi-grams</a></li>
<li class="chapter" data-level="9.3" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#visualizing-a-network-of-bigrams"><i class="fa fa-check"></i><b>9.3</b> Visualizing a network of bigrams</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="document-term-matrix.html"><a href="document-term-matrix.html"><i class="fa fa-check"></i><b>10</b> Document-term matrix</a><ul>
<li class="chapter" data-level="10.1" data-path="document-term-matrix.html"><a href="document-term-matrix.html#converting-dtm-into-dataframe"><i class="fa fa-check"></i><b>10.1</b> Converting DTM into dataframe</a></li>
<li class="chapter" data-level="10.2" data-path="document-term-matrix.html"><a href="document-term-matrix.html#generating-document-term-matrix"><i class="fa fa-check"></i><b>10.2</b> Generating Document-term matrix</a></li>
</ul></li>
<li class="divider"></li>
<li><a href="https://github.com/rstudio/bookdown" target="_blank" rel="noopener">Published with bookdown</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Natural Language Processing with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="Word-embeddings" class="section level1">
<h1><span class="header-section-number">Chapter 3</span> Word embeddings</h1>
<p>This section is based on this book: <a href="https://github.com/jjallaire/deep-learning-with-r-notebooks" class="uri">https://github.com/jjallaire/deep-learning-with-r-notebooks</a></p>
<div id="vectorizing-text" class="section level2">
<h2><span class="header-section-number">3.1</span> Vectorizing text</h2>
<p>It is the process of transforming text into numeric tensors. It consists of applying some tokenization scheme and then associating numeric vectors with the generated tokens. The generated vectors are packed into sequence tensors and fed into a deep neural network.
There are different ways to associate a vector with a token, such as <em>one-hot encoding</em> and <em>token embedding</em> (typically used for words and called <em>word embedding</em>).</p>
</div>
<div id="one-hot-encoding" class="section level2">
<h2><span class="header-section-number">3.2</span> One-hot encoding</h2>
<p>It consists of one-hot encoding the words existing in a sentence based on the whole vocabulary. We create a vector with length equal to the vocabulary and we place a one in the index that corresponds to the word existing in the sentences. Then, we can concatenate the one-hot vectors for each word. This method is considered inefficient since we obtain a sparse one-hot encoded vector (most indices are zero).</p>
<div class="figure">
<img src="images/one-hot.png" alt="One-hot encoding (source:https://www.tensorflow.org/tutorials/text/word_embeddings)" />
<p class="caption">One-hot encoding (source:<a href="https://www.tensorflow.org/tutorials/text/word_embeddings" class="uri">https://www.tensorflow.org/tutorials/text/word_embeddings</a>)</p>
</div>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb18-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb18-2" title="2">samples <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"The cat sat on the mat."</span>, <span class="st">"The dog ate my homework."</span>)</a>
<a class="sourceLine" id="cb18-3" title="3"><span class="co"># Creates a tokenizer, configured to only take into account the 1,000 </span></a>
<a class="sourceLine" id="cb18-4" title="4"><span class="co"># most common words, then builds the word index.</span></a>
<a class="sourceLine" id="cb18-5" title="5">tokenizer <-<span class="st"> </span><span class="kw">text_tokenizer</span>(<span class="dt">num_words =</span> <span class="dv">1000</span>) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb18-6" title="6"><span class="st"> </span><span class="kw">fit_text_tokenizer</span>(samples)</a>
<a class="sourceLine" id="cb18-7" title="7"><span class="co"># Turns strings into lists of integer indices</span></a>
<a class="sourceLine" id="cb18-8" title="8">sequences <-<span class="st"> </span><span class="kw">texts_to_sequences</span>(tokenizer, samples)</a>
<a class="sourceLine" id="cb18-9" title="9"><span class="co"># You could also directly get the one-hot binary representations. Vectorization </span></a>
<a class="sourceLine" id="cb18-10" title="10"><span class="co"># modes other than one-hot encoding are supported by this tokenizer.</span></a>
<a class="sourceLine" id="cb18-11" title="11">one_hot_results <-<span class="st"> </span><span class="kw">texts_to_matrix</span>(tokenizer, samples, <span class="dt">mode =</span> <span class="st">"binary"</span>)</a>
<a class="sourceLine" id="cb18-12" title="12"><span class="co"># How you can recover the word index that was computed</span></a>
<a class="sourceLine" id="cb18-13" title="13">word_index <-<span class="st"> </span>tokenizer<span class="op">$</span>word_index</a>
<a class="sourceLine" id="cb18-14" title="14"><span class="kw">cat</span>(<span class="st">"Found"</span>, <span class="kw">length</span>(word_index), <span class="st">"unique tokens.</span><span class="ch">\n</span><span class="st">"</span>)</a></code></pre></div>
<pre><code>## Found 9 unique tokens.</code></pre>
</div>
<div id="word-embeddings-methods" class="section level2">
<h2><span class="header-section-number">3.3</span> Word embeddings methods</h2>
<p>The vectors obtained with one-hot encoding are binary, sparse and very high dimensional (the same dimensionality as the number of words in the vocabulary). In contrast, “word embeddings” are low-dimensional dense vectors (as opposed to sparse vectors). They are learned from data. They are commonly 256-dimensional, 512-dimensional, or 1024-dimensional when dealing with large vocabularies.</p>
<p>There are two methods for obtaining word embeddings:</p>
<ul>
<li>Learn word embeddings jointly with a specified task (document classification, sentiment analysis…). For this, we start with random word vectors and learn the word vectors in the same way that we learn the weights of a neural network.</li>
<li>Use “pre-trained” word embeddings and apply them to our specific task.</li>
</ul>
<div id="learn-world-embeddings" class="section level3">
<h3><span class="header-section-number">3.3.1</span> Learn word embeddings</h3>
<p>Word embeddings aim at mapping human language into a geometric space in such a way that the geometric relationships between word vectors reflect the semantic relationships between the words. For example, synonyms should be embedded into similar word vectors. We expect the geometric distance between any two word vectors to represent the semantic distance between the associated words. Among the common meaningful geometric transformations in word embeddings, we can cite the “gender vectors” and “plural vectors”. For example, by adding a “female vector” to the vector “king”, we obtain the vector “queen”. In the same way, by adding a “plural vector”, we obtain “kings”. It is hard to find the “ideal” word embedding space to perfectly map general human language. Word embedding performance depends on the task we are working on. A word embedding for an English-language movie-review sentiment-analysis model may look very different from one for an English-language legal-document classification model, since the importance of some semantic relationships varies from task to task.
Therefore, it is useful to learn a new embedding space with every new task. Keras offers the possibility of learning embeddings using <code>layer_embedding()</code>.</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb20-1" title="1"><span class="co"># the embedding layer takes at least two arguments:</span></a>
<a class="sourceLine" id="cb20-2" title="2"><span class="co"># - the number of possible tokens, here 1000</span></a>
<a class="sourceLine" id="cb20-3" title="3"><span class="co"># - the dimensionality of the embeddings, here 64</span></a>
<a class="sourceLine" id="cb20-4" title="4">embedding_layer =<span class="st"> </span><span class="kw">layer_embedding</span>(<span class="dt">input_dim =</span> <span class="dv">1000</span>, <span class="dt">output_dim =</span> <span class="dv">64</span>)</a></code></pre></div>
<p>The <code>embedding_layer</code> is like a dictionary that maps integer indices to dense vectors. It takes as input a 2D tensor of integers, of shape <code>(samples, sequence_length)</code>, where each entry is a sequence of integers. It generates a 3D floating-point tensor, of shape <code>(samples, sequence_length, embedding_dimensionality)</code>.</p>
<p>Let’s apply <code>embedding_layer</code> to the IMDB movie-review sentiment prediction task. We will consider only the top 10,000 most common words and cut off the review after only 20 words. The network will learn 8-dimensional embeddings for each of the 10,000 words, turn the input integer sequences (2D integer tensor) into embedded sequences (3D float tensor), flatten the tensor to 2D, and train a single dense layer on top for classification.</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb21-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb21-2" title="2"><span class="co"># Number of words to consider as features</span></a>
<a class="sourceLine" id="cb21-3" title="3">max_features =<span class="st"> </span><span class="dv">10000</span></a>
<a class="sourceLine" id="cb21-4" title="4"><span class="co"># cut texts after this number of words (among top max_features most common words)</span></a>
<a class="sourceLine" id="cb21-5" title="5">maxlen =<span class="st"> </span><span class="dv">20</span></a>
<a class="sourceLine" id="cb21-6" title="6"><span class="co"># load the data as lists of integers</span></a>
<a class="sourceLine" id="cb21-7" title="7">imdb =<span class="st"> </span><span class="kw">dataset_imdb</span>(<span class="dt">num_words =</span> max_features)</a>
<a class="sourceLine" id="cb21-8" title="8"><span class="kw">c</span>(<span class="kw">c</span>(x_train, y_train), <span class="kw">c</span>(x_test, y_test)) <span class="op">%<-%</span><span class="st"> </span>imdb</a>
<a class="sourceLine" id="cb21-9" title="9"><span class="co"># This turns our lists of integers</span></a>
<a class="sourceLine" id="cb21-10" title="10"><span class="co"># into a 2D integer tensor of shape `(samples, maxlen)`</span></a>
<a class="sourceLine" id="cb21-11" title="11">x_train =<span class="st"> </span><span class="kw">pad_sequences</span>(x_train, <span class="dt">maxlen =</span> maxlen)</a>
<a class="sourceLine" id="cb21-12" title="12">x_test =<span class="st"> </span><span class="kw">pad_sequences</span>(x_test, <span class="dt">maxlen =</span> maxlen)</a></code></pre></div>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb22-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb22-2" title="2">model =<span class="st"> </span><span class="kw">keras_model_sequential</span>() <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-3" title="3"><span class="st"> </span><span class="co"># we specify the maximum input length to our embedding layer</span></a>
<a class="sourceLine" id="cb22-4" title="4"><span class="st"> </span><span class="co"># so we can later flatten the embedded inputs</span></a>
<a class="sourceLine" id="cb22-5" title="5"><span class="st"> </span><span class="kw">layer_embedding</span>(<span class="dt">input_dim =</span> <span class="dv">10000</span>, <span class="dt">output_dim =</span> <span class="dv">8</span>, <span class="dt">input_length =</span> maxlen) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-6" title="6"><span class="st"> </span><span class="co"># we flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)</span></a>
<a class="sourceLine" id="cb22-7" title="7"><span class="st"> </span><span class="kw">layer_flatten</span>() <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-8" title="8"><span class="st"> </span><span class="co"># We add the classifier on top </span></a>
<a class="sourceLine" id="cb22-9" title="9"><span class="st"> </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">1</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>)</a>
<a class="sourceLine" id="cb22-10" title="10"></a>
<a class="sourceLine" id="cb22-11" title="11">model <span class="op">%>%</span><span class="st"> </span><span class="kw">compile</span>(</a>
<a class="sourceLine" id="cb22-12" title="12"> <span class="dt">optimizer =</span> <span class="st">"rmsprop"</span>,</a>
<a class="sourceLine" id="cb22-13" title="13"> <span class="dt">loss =</span> <span class="st">"binary_crossentropy"</span>,</a>
<a class="sourceLine" id="cb22-14" title="14"> <span class="dt">metrics =</span> <span class="kw">c</span>(<span class="st">"acc"</span>)</a>
<a class="sourceLine" id="cb22-15" title="15">)</a>
<a class="sourceLine" id="cb22-16" title="16"></a>
<a class="sourceLine" id="cb22-17" title="17">history =<span class="st"> </span>model <span class="op">%>%</span><span class="st"> </span><span class="kw">fit</span>(</a>
<a class="sourceLine" id="cb22-18" title="18"> x_train, y_train,</a>
<a class="sourceLine" id="cb22-19" title="19"> <span class="dt">epochs =</span> <span class="dv">10</span>, <span class="co">#10</span></a>
<a class="sourceLine" id="cb22-20" title="20"> <span class="dt">batch_size =</span> <span class="dv">32</span>,</a>
<a class="sourceLine" id="cb22-21" title="21"> <span class="dt">validation_split =</span> <span class="fl">0.2</span></a>
<a class="sourceLine" id="cb22-22" title="22">)</a>
<a class="sourceLine" id="cb22-23" title="23"></a>
<a class="sourceLine" id="cb22-24" title="24"><span class="kw">plot</span>(history)</a></code></pre></div>
<pre><code>## `geom_smooth()` using formula 'y ~ x'</code></pre>
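<p><img src="NLP-book_files/figure-html/layer_embedding%20-1.png" alt="Plot of the training history (loss and accuracy on training and validation data) of the embedding model" width="672" /></p>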
</div>
<div id="pre-trained-word-embeddings" class="section level3">
<h3><span class="header-section-number">3.3.2</span> Pre-trained word embeddings</h3>
<p>When we have little training data available to learn task-specific word embeddings based on our vocabulary, it is preferable to use pre-trained word embeddings. This technique is similar to transfer learning in image classification tasks, where we use a pretrained classifier. A pre-computed embedding is supposed to capture generic aspects of language structure. These word embeddings are trained based on the co-occurrence of words in sentences and documents within a large corpus of text. We can distinguish two main powerful word embedding models: <strong>Word2Vec</strong> and <strong>GloVe</strong>.</p>
<div id="word2vec" class="section level4">
<h4><span class="header-section-number">3.3.2.1</span> Word2Vec</h4>
</div>
<div id="glove" class="section level4">
<h4><span class="header-section-number">3.3.2.2</span> GloVe</h4>
</div>
</div>
</div>
<div id="applications" class="section level2">
<h2><span class="header-section-number">3.4</span> Applications</h2>
<div id="using-skip-gram" class="section level3">
<h3><span class="header-section-number">3.4.1</span> Using Skip-Gram</h3>
<p>We use the Amazon Fine Foods Reviews dataset, which consists of 500,000 reviews of Amazon fine food including product and user information, ratings, and narrative text.
Source: <a href="https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/" class="uri">https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/</a></p>
<div id="getting-the-data" class="section level4">
<h4><span class="header-section-number">3.4.1.1</span> Getting the data</h4>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb24-1" title="1"><span class="co"># we download the data</span></a>
<a class="sourceLine" id="cb24-2" title="2"><span class="kw">download.file</span>(<span class="st">"https://snap.stanford.edu/data/finefoods.txt.gz"</span>, <span class="st">"finefoods.txt.gz"</span>)</a></code></pre></div>
<p>Now we load the plain text reviews:</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb25-1" title="1"><span class="kw">library</span>(readr)</a>
<a class="sourceLine" id="cb25-2" title="2"><span class="kw">library</span>(stringr)</a>
<a class="sourceLine" id="cb25-3" title="3">reviews <-<span class="st"> </span><span class="kw">read_lines</span>(<span class="st">"finefoods.txt.gz"</span>) </a>
<a class="sourceLine" id="cb25-4" title="4">reviews <-<span class="st"> </span>reviews[<span class="kw">str_sub</span>(reviews, <span class="dv">1</span>, <span class="dv">12</span>) <span class="op">==</span><span class="st"> "review/text:"</span>]</a>
<a class="sourceLine" id="cb25-5" title="5">reviews <-<span class="st"> </span><span class="kw">str_sub</span>(reviews, <span class="dt">start =</span> <span class="dv">14</span>)</a>
<a class="sourceLine" id="cb25-6" title="6">reviews <-<span class="st"> </span><span class="kw">iconv</span>(reviews, <span class="dt">to =</span> <span class="st">"UTF-8"</span>)</a>
<a class="sourceLine" id="cb25-7" title="7"><span class="kw">head</span>(reviews, <span class="dv">2</span>)</a></code></pre></div>
<pre><code>## [1] "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most."
## [2] "Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \"Jumbo\"."</code></pre>
</div>
<div id="preprocessing" class="section level4">
<h4><span class="header-section-number">3.4.1.2</span> Preprocessing</h4>
<p>We use <code>text_tokenizer</code> in order to transform each review into a sequence of integer tokens. By fixing <code>num_words = 20000</code>, we assign an integer token to each of the 20,000 most common words (the other words will be assigned to token 0).</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb27-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb27-2" title="2">tokenizer =<span class="st"> </span><span class="kw">text_tokenizer</span>(<span class="dt">num_words =</span> <span class="dv">20000</span>)</a>
<a class="sourceLine" id="cb27-3" title="3">tokenizer <span class="op">%>%</span><span class="st"> </span><span class="kw">fit_text_tokenizer</span>(reviews)</a>
<a class="sourceLine" id="cb27-4" title="4"><span class="co">#we can show the number of documents</span></a>
<a class="sourceLine" id="cb27-5" title="5">tokenizer<span class="op">$</span>document_count</a></code></pre></div>
<pre><code>## [1] 568454</code></pre>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb29-1" title="1"><span class="co"># we can show the word index list</span></a>
<a class="sourceLine" id="cb29-2" title="2">tokenizer<span class="op">$</span>word_index <span class="op">%>%</span></a>
<a class="sourceLine" id="cb29-3" title="3"><span class="st"> </span><span class="kw">head</span>()</a></code></pre></div>
<pre><code>## $the
## [1] 1
##
## $i
## [1] 2
##
## $and
## [1] 3
##
## $a
## [1] 4
##
## $to
## [1] 5
##
## $it
## [1] 6</code></pre>
</div>
<div id="skpi-gram-model" class="section level4">
<h4><span class="header-section-number">3.4.1.3</span> Skip-Gram model</h4>
<p>In the skip-gram model, we use each word as input to a log-linear classifier, then predict words within a certain range before and after this word. It would be very computationally expensive if we output a probability distribution over the whole vocabulary for each target word we input into the model. Therefore, we will use negative sampling. It consists of sampling some words that don’t appear in the context and training a binary classifier to predict whether the context word we passed is truly from the context or not.</p>
<p>Let’s define a generator function to yield batches for model training. This generator function will receive a vector of texts, a tokenizer and the arguments for the skip-gram (the size of the window around each target word we examine and how many negative samples we want to sample for each target word).</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb31-1" title="1"><span class="kw">library</span>(reticulate)</a>
<a class="sourceLine" id="cb31-2" title="2"><span class="kw">library</span>(purrr)</a>
<a class="sourceLine" id="cb31-3" title="3">skipgrams_generator <-<span class="st"> </span><span class="cf">function</span>(text, tokenizer, window_size, negative_samples) {</a>
<a class="sourceLine" id="cb31-4" title="4"> gen <-<span class="st"> </span><span class="kw">texts_to_sequences_generator</span>(tokenizer, <span class="kw">sample</span>(text))</a>
<a class="sourceLine" id="cb31-5" title="5"> <span class="cf">function</span>() {</a>
<a class="sourceLine" id="cb31-6" title="6"> skip <-<span class="st"> </span><span class="kw">generator_next</span>(gen) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb31-7" title="7"><span class="st"> </span><span class="kw">skipgrams</span>(</a>
<a class="sourceLine" id="cb31-8" title="8"> <span class="dt">vocabulary_size =</span> tokenizer<span class="op">$</span>num_words, </a>
<a class="sourceLine" id="cb31-9" title="9"> <span class="dt">window_size =</span> window_size, </a>
<a class="sourceLine" id="cb31-10" title="10"> <span class="dt">negative_samples =</span> <span class="dv">1</span></a>
<a class="sourceLine" id="cb31-11" title="11"> )</a>
<a class="sourceLine" id="cb31-12" title="12"> x <-<span class="st"> </span><span class="kw">transpose</span>(skip<span class="op">$</span>couples) <span class="op">%>%</span><span class="st"> </span><span class="kw">map</span>(. <span class="op">%>%</span><span class="st"> </span>unlist <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(<span class="dt">ncol =</span> <span class="dv">1</span>))</a>
<a class="sourceLine" id="cb31-13" title="13"> y <-<span class="st"> </span>skip<span class="op">$</span>labels <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>(<span class="dt">ncol =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb31-14" title="14"> <span class="kw">list</span>(x, y)</a>
<a class="sourceLine" id="cb31-15" title="15"> }</a>
<a class="sourceLine" id="cb31-16" title="16">} </a></code></pre></div>
<p>We now define the Keras model using the Keras functional API.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb32-1" title="1"><span class="co"># Dimension of the embedding vector</span></a>
<a class="sourceLine" id="cb32-2" title="2">embedding_size =<span class="st"> </span><span class="dv">128</span> </a>
<a class="sourceLine" id="cb32-3" title="3"><span class="co"># how many words to consider left and right</span></a>
<a class="sourceLine" id="cb32-4" title="4">skip_window =<span class="st"> </span><span class="dv">5</span></a>
<a class="sourceLine" id="cb32-5" title="5"><span class="co"># number of negative examples to sample for each word</span></a>
<a class="sourceLine" id="cb32-6" title="6">num_sampled =<span class="st"> </span><span class="dv">1</span></a></code></pre></div>
<p>We will write placeholders for the inputs using the <code>layer_input</code> function.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb33-1" title="1">input_target =<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb33-2" title="2">input_context =<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="dv">1</span>)</a></code></pre></div>
<p>Now let’s define the embedding matrix. The embedding is a matrix with dimensions (vocabulary, embedding_size) that acts as a lookup table for the word vectors.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb34-1" title="1">embedding <-<span class="st"> </span><span class="kw">layer_embedding</span>(</a>
<a class="sourceLine" id="cb34-2" title="2"> <span class="dt">input_dim =</span> tokenizer<span class="op">$</span>num_words <span class="op">+</span><span class="st"> </span><span class="dv">1</span>, </a>
<a class="sourceLine" id="cb34-3" title="3"> <span class="dt">output_dim =</span> embedding_size, </a>
<a class="sourceLine" id="cb34-4" title="4"> <span class="dt">input_length =</span> <span class="dv">1</span>, </a>
<a class="sourceLine" id="cb34-5" title="5"> <span class="dt">name =</span> <span class="st">"embedding"</span></a>
<a class="sourceLine" id="cb34-6" title="6">)</a>
<a class="sourceLine" id="cb34-7" title="7"></a>
<a class="sourceLine" id="cb34-8" title="8">target_vector <-<span class="st"> </span>input_target <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb34-9" title="9"><span class="st"> </span><span class="kw">embedding</span>() <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb34-10" title="10"><span class="st"> </span><span class="kw">layer_flatten</span>()</a>
<a class="sourceLine" id="cb34-11" title="11"></a>
<a class="sourceLine" id="cb34-12" title="12">context_vector <-<span class="st"> </span>input_context <span class="op">%>%</span></a>
<a class="sourceLine" id="cb34-13" title="13"><span class="st"> </span><span class="kw">embedding</span>() <span class="op">%>%</span></a>
<a class="sourceLine" id="cb34-14" title="14"><span class="st"> </span><span class="kw">layer_flatten</span>()</a></code></pre></div>
<p>Now we define how the <code>target_vector</code> will be related to the <code>context_vector</code> in order to make the network output equal to 1 when the context word really appeared in the context and 0 otherwise. We want <code>target_vector</code> to be similar to <code>context_vector</code> if they appeared in the same context. A typical measure of similarity is the cosine similarity. Given two vectors A and B, the cosine similarity is defined as the Euclidean dot product of A and B normalized by their magnitudes. As we don’t need the similarity to be normalized inside the network, we will only calculate the dot product and then pass it through a dense layer with sigmoid activation.</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb35-1" title="1">dot_product <-<span class="st"> </span><span class="kw">layer_dot</span>(<span class="kw">list</span>(target_vector, context_vector), <span class="dt">axes =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb35-2" title="2">output <-<span class="st"> </span><span class="kw">layer_dense</span>(dot_product, <span class="dt">units =</span> <span class="dv">1</span>, <span class="dt">activation =</span> <span class="st">"sigmoid"</span>)</a></code></pre></div>
<p>Let’s create and compile the model</p>
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb36-1" title="1">model <-<span class="st"> </span><span class="kw">keras_model</span>(<span class="kw">list</span>(input_target, input_context), output)</a>
<a class="sourceLine" id="cb36-2" title="2">model <span class="op">%>%</span><span class="st"> </span><span class="kw">compile</span>(<span class="dt">loss =</span> <span class="st">"binary_crossentropy"</span>, <span class="dt">optimizer =</span> <span class="st">"adam"</span>)</a>
<a class="sourceLine" id="cb36-3" title="3"><span class="kw">summary</span>(model)</a></code></pre></div>
<pre><code>## Model: "model"
## ________________________________________________________________________________
## Layer (type) Output Shape Param # Connected to
## ================================================================================
## input_1 (InputLayer) [(None, 1)] 0
## ________________________________________________________________________________
## input_2 (InputLayer) [(None, 1)] 0
## ________________________________________________________________________________
## embedding (Embedding) (None, 1, 128) 2560128 input_1[0][0]
## input_2[0][0]
## ________________________________________________________________________________
## flatten_1 (Flatten) (None, 128) 0 embedding[0][0]
## ________________________________________________________________________________
## flatten_2 (Flatten) (None, 128) 0 embedding[1][0]
## ________________________________________________________________________________
## dot (Dot) (None, 1) 0 flatten_1[0][0]
## flatten_2[0][0]
## ________________________________________________________________________________
## dense_1 (Dense) (None, 1) 2 dot[0][0]
## ================================================================================
## Total params: 2,560,130
## Trainable params: 2,560,130
## Non-trainable params: 0
## ________________________________________________________________________________</code></pre>
</div>
<div id="model-training" class="section level4">
<h4><span class="header-section-number">3.4.1.4</span> Model training</h4>
<p>To fit the model we need to specify the number of training steps and the number of epochs. We will use only two epochs to keep the computation time reasonable.</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb38-1" title="1">model <span class="op">%>%</span></a>
<a class="sourceLine" id="cb38-2" title="2"><span class="st"> </span><span class="kw">fit_generator</span>(</a>
<a class="sourceLine" id="cb38-3" title="3"> <span class="kw">skipgrams_generator</span>(reviews, tokenizer, skip_window, negative_samples), </a>
<a class="sourceLine" id="cb38-4" title="4"> <span class="dt">steps_per_epoch =</span> <span class="dv">2000</span>, <span class="dt">epochs =</span> <span class="dv">2</span></a>
<a class="sourceLine" id="cb38-5" title="5"> )</a></code></pre></div>
<p>We can extract the embedding matrix from the model using the <code>get_weights()</code> function.</p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb39-1" title="1"><span class="kw">library</span>(dplyr)</a>
<a class="sourceLine" id="cb39-2" title="2"></a>
<a class="sourceLine" id="cb39-3" title="3">embedding_matrix <-<span class="st"> </span><span class="kw">get_weights</span>(model)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb39-4" title="4"></a>
<a class="sourceLine" id="cb39-5" title="5">words <-<span class="st"> </span><span class="kw">data_frame</span>(</a>
<a class="sourceLine" id="cb39-6" title="6"> <span class="dt">word =</span> <span class="kw">names</span>(tokenizer<span class="op">$</span>word_index), </a>
<a class="sourceLine" id="cb39-7" title="7"> <span class="dt">id =</span> <span class="kw">as.integer</span>(<span class="kw">unlist</span>(tokenizer<span class="op">$</span>word_index))</a>
<a class="sourceLine" id="cb39-8" title="8">)</a></code></pre></div>
<pre><code>## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.</code></pre>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb41-1" title="1">words <-<span class="st"> </span>words <span class="op">%>%</span></a>
<a class="sourceLine" id="cb41-2" title="2"><span class="st"> </span><span class="kw">filter</span>(id <span class="op"><=</span><span class="st"> </span>tokenizer<span class="op">$</span>num_words) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb41-3" title="3"><span class="st"> </span><span class="kw">arrange</span>(id)</a>
<a class="sourceLine" id="cb41-4" title="4"></a>
<a class="sourceLine" id="cb41-5" title="5"><span class="kw">row.names</span>(embedding_matrix) <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"UNK"</span>, words<span class="op">$</span>word)</a>
<a class="sourceLine" id="cb41-6" title="6"><span class="kw">dim</span>(embedding_matrix)</a></code></pre></div>
<pre><code>## [1] 20001 128</code></pre>
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb43-1" title="1"><span class="kw">head</span>(embedding_matrix)</a></code></pre></div>
<pre><code>## [,1] [,2] [,3] [,4] [,5] [,6]
## UNK 0.0170815 -0.001318716 0.04716846 0.0001694448 -0.01495628 0.01284777
## the 0.1174329 -0.383204609 -0.14486948 0.2978044450 0.17878745 -0.28364837
## i 0.1669428 -0.247277036 -0.21312974 0.3762883544 0.28889671 -0.30660424
## and 0.1605144 -0.230475992 -0.17357136 0.3322810829 0.20980199 -0.25617325
## a 0.1201052 -0.146714032 -0.15440495 0.3936571181 0.22841439 -0.16955599
## to 0.1918591 -0.273127079 -0.17826691 0.3529382646 0.28203753 -0.24766697
## [,7] [,8] [,9] [,10] [,11]
## UNK -0.007465005 -0.01517171 0.03539601 0.04036165 -0.0001689792
## the 0.299164832 -0.30537584 -0.36950541 -0.28876448 -0.2681583762
## i 0.362938732 -0.26686862 -0.34998602 -0.23723570 -0.2789547443
## and 0.310600638 -0.21819803 -0.34236768 -0.32119495 -0.1779417843
## a 0.278418452 -0.25963432 -0.35211003 -0.31251714 -0.2506726086
## to 0.347074360 -0.16523284 -0.32140639 -0.28436336 -0.2654981911
## [,12] [,13] [,14] [,15] [,16] [,17]
## UNK -0.0004024878 -0.01031651 -0.0329108 -0.03570246 -0.01811307 -0.02383759
## the 0.2796629667 0.31438842 -0.2472963 0.20960861 -0.28063810 0.17194803
## i 0.3474592566 0.33163065 -0.3115005 0.29770941 -0.32637006 0.09885775
## and 0.3099879324 0.28850329 -0.2122750 0.25450185 -0.35340777 0.15842982
## a 0.3022259772 0.33895627 -0.2288224 0.27653986 -0.23382780 0.15667632
## to 0.3335136473 0.27410549 -0.3449964 0.31313083 -0.33614126 0.02599920
## [,18] [,19] [,20] [,21] [,22] [,23]
## UNK 0.001930606 -0.04853332 0.03184437 -0.03507299 0.03099043 -0.01559343
## the 0.249449074 0.38885596 -0.10571286 0.03303542 -0.01977662 0.12713103
## i 0.266584009 0.32550138 -0.14738728 0.10654508 0.04477475 0.28375289
## and 0.281591505 0.33543691 -0.12878042 0.19387311 0.08222436 0.23661953
## a 0.213139340 0.28658631 -0.13082723 0.11910986 0.03216605 0.19973753
## to 0.144440114 0.35665330 -0.12372198 0.12668315 0.19030270 0.19488603
## [,24] [,25] [,26] [,27] [,28] [,29]
## UNK -0.02061616 0.01281368 -0.03231498 0.007334851 0.03719245 0.03511449
## the -0.17688274 -0.32929045 -0.29310527 0.289794803 0.13527869 -0.27275386
## i -0.23470533 -0.32637039 -0.26748180 0.318160236 0.03208359 -0.21226105
## and -0.25599492 -0.34276465 -0.33586788 0.217786700 0.02351790 -0.27456364
## a -0.17987984 -0.37211826 -0.33599785 0.294163078 -0.06172014 -0.27303630
## to -0.25984651 -0.32187936 -0.29270798 0.297974795 -0.07057865 -0.25982314
## [,30] [,31] [,32] [,33] [,34] [,35]
## UNK -0.01691567 0.0008091219 -0.03200240 -0.006604362 -0.03469641 -0.03299238
## the 0.25344208 0.1719853282 0.07652652 -0.321915329 0.23948634 0.01470774
## i 0.22992177 0.0279782731 0.08346167 -0.304561466 0.12635729 0.02191944
## and 0.18178248 0.1102448180 0.08257330 -0.269136578 0.20793088 -0.09406497
## a 0.18358734 0.2180162519 0.05090325 -0.267941982 0.16697714 0.03600145
## to 0.25895324 -0.0176757276 0.13935305 -0.208568946 0.19998248 -0.05754858
## [,36] [,37] [,38] [,39] [,40] [,41]
## UNK 0.03315267 0.04222684 -0.02326957 -0.03361871 -0.0365397 -0.043440260
## the -0.28853074 0.37395093 0.20946701 0.15802002 0.2907013 -0.010865721
## i -0.36051789 0.36837849 0.31315809 0.19938451 0.2936096 -0.131256387
## and -0.27480209 0.37666577 0.28615251 0.18097939 0.2794661 -0.006140751
## a -0.27293894 0.33773720 0.32223794 0.22417280 0.2404891 -0.039677981
## to -0.30174917 0.30440938 0.29927579 0.14390787 0.2480671 -0.093178861
## [,42] [,43] [,44] [,45] [,46] [,47]
## UNK 0.01521539 0.00477301 0.0318060 -0.02853984 0.0437434 -0.01864365
## the 0.30115995 0.05320331 0.3138930 0.10096217 -0.2055138 -0.20989250
## i 0.35655829 -0.11047912 0.3219035 0.07911555 -0.2606398 -0.12337824
## and 0.33989990 0.04763594 0.2755820 0.04128102 -0.1459931 -0.06884515
## a 0.35712439 0.04051579 0.2893077 0.08757305 -0.1562458 -0.03052964
## to 0.36761856 -0.03275176 0.3164682 0.02745412 -0.2089933 -0.10701507
## [,48] [,49] [,50] [,51] [,52] [,53]
## UNK 0.04101357 0.03473446 0.04457737 -0.00114752 -0.04412064 -0.03156789
## the 0.02520799 -0.25871646 -0.37619755 0.38879156 -0.25250259 -0.39248210
## i 0.01821698 -0.22392237 -0.32261470 0.35049382 -0.27527589 -0.38099816
## and -0.05184063 -0.16932927 -0.33808553 0.37406263 -0.32858366 -0.34304696
## a -0.09030625 -0.29941657 -0.30359963 0.29772824 -0.26200148 -0.27937773
## to -0.01727733 -0.24934988 -0.25524116 0.35741013 -0.29380852 -0.32908934
## [,54] [,55] [,56] [,57] [,58] [,59]
## UNK 0.006125987 -0.02187279 0.03910586 -0.01629753 0.04972812 0.03518805
## the -0.310638994 0.36762497 -0.10685650 0.34639886 0.35899246 -0.36781129
## i -0.349121183 0.37237552 -0.21593590 0.31275219 0.42300200 -0.30578846
## and -0.344224006 0.30466965 -0.15587379 0.32809687 0.32287005 -0.33381802
## a -0.350583643 0.32805666 -0.11700507 0.27578056 0.28753132 -0.29372326
## to -0.369859546 0.37795553 -0.16025186 0.28375304 0.33897606 -0.31854972
## [,60] [,61] [,62] [,63] [,64] [,65]
## UNK 0.04667592 -0.01325144 0.04597353 0.007199753 0.04204318 0.004700471
## the 0.27253532 0.14934577 0.30485842 -0.316160858 -0.27030918 -0.083663106
## i 0.23735967 0.11750556 0.31638011 -0.297600389 -0.21566036 -0.096772537
## and 0.18881306 0.06630570 0.31316441 -0.284757823 -0.26432148 -0.110183306
## a 0.23069674 0.08311401 0.23403980 -0.271946669 -0.19995712 -0.172960505
## to 0.29635924 0.04342796 0.25315505 -0.227531537 -0.28717574 -0.092312992
## [,66] [,67] [,68] [,69] [,70] [,71]
## UNK -0.02999171 -0.01793531 0.04247624 0.002061225 -0.02451816 0.02576012
## the -0.11517787 0.12338851 -0.04987039 -0.034545448 -0.06101116 0.05578730
## i 0.06015281 0.01339131 0.00329218 -0.055252384 -0.08577745 0.12629287
## and -0.07259820 -0.03888071 0.01285036 -0.006432456 -0.02521065 0.09755906
## a 0.01760340 -0.11909988 0.08444444 -0.040843900 0.01344274 0.11524677
## to 0.04376663 -0.09150210 0.03593209 -0.064415947 -0.08948220 0.10198851
## [,72] [,73] [,74] [,75] [,76] [,77]
## UNK 0.008057524 -0.01873593 0.004101884 -0.02451124 -0.01045469 -0.007409252
## the -0.210249856 0.24684344 -0.178273425 0.12195436 -0.14832009 -0.229609445
## i -0.339593619 0.25152388 -0.016085856 0.11553814 -0.24507025 -0.292536318
## and -0.210022196 0.24458480 -0.108235233 0.12692633 -0.15382548 -0.244550496
## a -0.300737977 0.19017297 -0.177162036 0.09492953 -0.15454216 -0.303135395
## to -0.386955142 0.19876392 -0.071368732 0.08350065 -0.25868317 -0.269913286
## [,78] [,79] [,80] [,81] [,82] [,83]
## UNK 0.02775276 0.02282996 0.00179093 0.001864087 -0.02228262 0.02817461
## the -0.35284138 0.27658230 -0.26196435 0.198721379 -0.04214166 -0.37608743
## i -0.37237868 0.28755802 -0.29970258 0.303372979 -0.02192708 -0.34754741
## and -0.32777765 0.19116035 -0.25336358 0.194186181 0.01598708 -0.33635759
## a -0.30274969 0.23791181 -0.28293476 0.306074739 -0.07584713 -0.36234859
## to -0.32587340 0.23608799 -0.21567842 0.323878646 -0.03771929 -0.36036131
## [,84] [,85] [,86] [,87] [,88] [,89]
## UNK -0.003465034 0.01472980 0.04781802 -0.007102478 0.04773111 -0.01337481
## the 0.288525522 -0.08883095 -0.22900291 -0.169002935 0.16737778 -0.19447237
## i 0.313215643 -0.19014385 -0.28267121 -0.305975825 0.15154187 -0.13293040
## and 0.332025588 -0.14811261 -0.27446076 -0.231908619 0.10666943 -0.24451743
## a 0.283321589 -0.14821632 -0.25289920 -0.340203524 0.18615662 -0.28705639
## to 0.297496915 -0.17047712 -0.22169787 -0.278337449 0.18933631 -0.13048588
## [,90] [,91] [,92] [,93] [,94] [,95]
## UNK -0.01147924 -0.00587051 0.04609055 -0.0001483187 0.04964909 -0.01832782
## the -0.32560977 -0.17560473 -0.25437045 0.1731803566 0.28145239 0.37646624
## i -0.37563258 -0.22397560 -0.19170810 0.2082554251 0.29436311 0.39287877
## and -0.30389935 -0.06919212 -0.20327331 0.2024096847 0.29195097 0.38377520
## a -0.28205562 -0.17666516 -0.23301259 0.2374358177 0.33052328 0.34986836
## to -0.30297431 -0.15620883 -0.13064004 0.2584783733 0.28516826 0.32652554
## [,96] [,97] [,98] [,99] [,100] [,101]
## UNK 0.007632814 0.005672503 -0.03266094 -0.03422055 0.01324456 0.02385963
## the 0.302464426 0.110885777 -0.25989842 0.38501096 -0.21835038 -0.20524764
## i 0.296282232 0.233367577 -0.20001604 0.38733375 -0.24814697 -0.15178022
## and 0.310684830 0.118035696 -0.17034222 0.33256108 -0.19430402 -0.18163270
## a 0.337718070 0.206213146 -0.24668492 0.32690060 -0.22618128 -0.18086091
## to 0.365883678 0.275636226 -0.25219843 0.31734639 -0.24367192 -0.09583790
## [,102] [,103] [,104] [,105] [,106] [,107]
## UNK -0.04407374 -0.03826056 0.001508869 -0.01589622 -0.02959334 0.02968781
## the 0.19846255 0.21909449 -0.077848770 0.24505678 0.34444517 0.16305423
## i 0.28018263 0.23005924 -0.110724866 0.19277629 0.37343127 0.09825584
## and 0.24288949 0.20457232 -0.084622771 0.14417087 0.26135206 0.06721097
## a 0.29050061 0.30875629 -0.190523997 0.18856290 0.30580518 0.14732127
## to 0.26581255 0.28777468 -0.149579942 0.14878766 0.26804748 0.02433010
## [,108] [,109] [,110] [,111] [,112] [,113]
## UNK 0.032761965 -0.03484957 0.04724887 -0.01673875 0.04846228 0.01651904
## the 0.072625257 -0.22000135 0.29008779 0.21206540 -0.08905456 0.30720186
## i 0.137971386 -0.30346900 0.32460919 0.24432959 -0.04133808 0.37891588
## and 0.067033872 -0.28801164 0.25215000 0.20263389 -0.07493820 0.23705998
## a 0.008180861 -0.27213508 0.28106183 0.26201847 -0.13696785 0.24244435
## to 0.133614138 -0.27308589 0.25494623 0.20437345 0.09366596 0.26203859
## [,114] [,115] [,116] [,117] [,118] [,119]
## UNK 0.03307271 0.006484438 0.04113635 -0.04053799 0.03232178 -0.01916174
## the 0.37055835 0.000467650 -0.33548412 -0.06496470 -0.36024690 -0.24840827
## i 0.32405031 0.068578012 -0.42496726 -0.02733710 -0.32280901 -0.25782296
## and 0.37622270 0.064659454 -0.35911053 0.03896505 -0.33915231 -0.27443057
## a 0.37359667 0.117730454 -0.29758999 0.10123279 -0.32194731 -0.21069331
## to 0.32364631 0.143883005 -0.31454709 0.14705095 -0.27599317 -0.22744414
## [,120] [,121] [,122] [,123] [,124] [,125]
## UNK -0.00625832 -0.04583708 0.01545075 0.03551065 -0.02845714 -0.024710560
## the 0.23234977 0.37063292 -0.06700579 0.18953991 -0.30410570 -0.069787078
## i 0.36668271 0.31222266 -0.09937420 0.27160498 -0.34567764 -0.001867783
## and 0.33828005 0.33334142 -0.06587321 0.30039573 -0.28826779 0.067308143
## a 0.30670270 0.39588228 -0.05656852 0.15388277 -0.30768496 0.107387684
## to 0.30960637 0.33873239 -0.03573503 0.23836206 -0.29423913 0.053528536
## [,126] [,127] [,128]
## UNK 0.03197979 0.02286074 -0.04385829
## the 0.36302400 0.25149447 0.23504810
## i 0.39596969 0.26540828 0.26525143
## and 0.33469549 0.24421459 0.21839896
## a 0.31807998 0.26717538 0.26289040
## to 0.36336777 0.25848782 0.25542757</code></pre>
</div>
<div id="understanding-the-embeddings" class="section level4">
<h4><span class="header-section-number">3.4.1.5</span> Understanding the embeddings</h4>
<p>We can now find words that are close to each other in the embedding. We will use the cosine similarity, since it is the normalized version of the dot product that the model was trained on.</p>
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb45-1" title="1"><span class="kw">library</span>(text2vec)</a></code></pre></div>
<pre><code>##
## Attaching package: 'text2vec'</code></pre>
<pre><code>## The following objects are masked from 'package:keras':
##
## fit, normalize</code></pre>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb48-1" title="1">find_similar_words <-<span class="st"> </span><span class="cf">function</span>(word, embedding_matrix, <span class="dt">n =</span> <span class="dv">5</span>) {</a>
<a class="sourceLine" id="cb48-2" title="2"> similarities <-<span class="st"> </span>embedding_matrix[word, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">%>%</span></a>
<a class="sourceLine" id="cb48-3" title="3"><span class="st"> </span><span class="kw">sim2</span>(embedding_matrix, <span class="dt">y =</span> ., <span class="dt">method =</span> <span class="st">"cosine"</span>)</a>
<a class="sourceLine" id="cb48-4" title="4"> </a>
<a class="sourceLine" id="cb48-5" title="5"> similarities[,<span class="dv">1</span>] <span class="op">%>%</span><span class="st"> </span><span class="kw">sort</span>(<span class="dt">decreasing =</span> <span class="ot">TRUE</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">head</span>(n)</a>
<a class="sourceLine" id="cb48-6" title="6">}</a>
<a class="sourceLine" id="cb48-7" title="7"></a>
<a class="sourceLine" id="cb48-8" title="8"></a>
<a class="sourceLine" id="cb48-9" title="9"><span class="kw">find_similar_words</span>(<span class="st">"delicious"</span>, embedding_matrix)</a></code></pre></div>
<pre><code>## delicious bought green texture price
## 1.0000000 0.9809152 0.9789813 0.9783692 0.9781281</code></pre>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb50-1" title="1"><span class="kw">find_similar_words</span>(<span class="st">"cats"</span>, embedding_matrix)</a></code></pre></div>
<pre><code>## cats chocolate best too bag
## 1.0000000 0.9785330 0.9782802 0.9773057 0.9770379</code></pre>
<p>The t-SNE algorithm can be used to visualize the embeddings. Because of time constraints we will only use it with the first 500 words. To understand more about the t-SNE method see the article: <a href="https://distill.pub/2016/misread-tsne/" class="uri">https://distill.pub/2016/misread-tsne/</a></p>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb52-1" title="1"><span class="kw">library</span>(Rtsne)</a>
<a class="sourceLine" id="cb52-2" title="2"><span class="kw">library</span>(ggplot2)</a>
<a class="sourceLine" id="cb52-3" title="3"><span class="kw">library</span>(plotly)</a></code></pre></div>
<pre><code>##
## Attaching package: 'plotly'</code></pre>
<pre><code>## The following object is masked from 'package:ggplot2':
##
## last_plot</code></pre>
<pre><code>## The following object is masked from 'package:stats':
##
## filter</code></pre>
<pre><code>## The following object is masked from 'package:graphics':
##
## layout</code></pre>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb57-1" title="1">tsne <-<span class="st"> </span><span class="kw">Rtsne</span>(embedding_matrix[<span class="dv">2</span><span class="op">:</span><span class="dv">500</span>,], <span class="dt">perplexity =</span> <span class="dv">50</span>, <span class="dt">pca =</span> <span class="ot">FALSE</span>)</a>
<a class="sourceLine" id="cb57-2" title="2"></a>
<a class="sourceLine" id="cb57-3" title="3">tsne_plot <-<span class="st"> </span>tsne<span class="op">$</span>Y <span class="op">%>%</span></a>
<a class="sourceLine" id="cb57-4" title="4"><span class="st"> </span><span class="kw">as.data.frame</span>() <span class="op">%>%</span></a>
<a class="sourceLine" id="cb57-5" title="5"><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">word =</span> <span class="kw">row.names</span>(embedding_matrix)[<span class="dv">2</span><span class="op">:</span><span class="dv">500</span>]) <span class="op">%>%</span></a>
<a class="sourceLine" id="cb57-6" title="6"><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> V1, <span class="dt">y =</span> V2, <span class="dt">label =</span> word)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb57-7" title="7"><span class="st"> </span><span class="kw">geom_text</span>(<span class="dt">size =</span> <span class="dv">3</span>)</a>
<a class="sourceLine" id="cb57-8" title="8">tsne_plot</a></code></pre></div>
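<p><img src="NLP-book_files/figure-html/Rtsne%20-1.png" alt="Scatter plot of the two-dimensional t-SNE projection of the word embeddings, with each word drawn as a text label" width="672" /></p>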
</div>
</div>
<div id="using-glove" class="section level3">
<h3><span class="header-section-number">3.4.2</span> Using GloVe</h3>
<p>source: <a href="http://text2vec.org/glove.html" class="uri">http://text2vec.org/glove.html</a></p>
<p>In this example, we will use GloVe to test how well it captures linguistic regularities. By taking the word vectors corresponding to the words “paris”, “france”, and “germany”, we are supposed to obtain “berlin” as the closest resulting vector:
<span class="math inline">\(vector("paris") - vector("france") + vector("germany")\)</span></p>
<p>We will use the Wikipedia data which is used as a demo by word2vec.</p>
<div class="sourceCode" id="cb58"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb58-1" title="1"><span class="co"># download data</span></a>
<a class="sourceLine" id="cb58-2" title="2"><span class="co"># download.file("http://mattmahoney.net/dc/text8.zip", "D:/NLP/NLP-book/data/text8.zip")</span></a>
<a class="sourceLine" id="cb58-3" title="3"><span class="co"># unzip("D:/NLP/NLP-book/data/text8.zip", files = "text8", exdir = "D:/NLP/NLP-book/data/text8")</span></a>
<a class="sourceLine" id="cb58-4" title="4"><span class="co"># load data</span></a>
<a class="sourceLine" id="cb58-5" title="5">wiki =<span class="st"> </span><span class="kw">readLines</span>(<span class="st">"D:/NLP/NLP-book/data/text8/text8"</span>, <span class="dt">n =</span> <span class="dv">1</span>, <span class="dt">warn =</span> <span class="ot">FALSE</span>)</a></code></pre></div>
<p>Now, we create a vocabulary, i.e. the set of words for which we want to learn word vectors.</p>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb59-1" title="1"><span class="co"># Create iterator over tokens</span></a>
<a class="sourceLine" id="cb59-2" title="2">tokens <-<span class="st"> </span><span class="kw">space_tokenizer</span>(wiki)</a>
<a class="sourceLine" id="cb59-3" title="3"><span class="co"># Create vocabulary. Terms will be unigrams (simple words).</span></a>
<a class="sourceLine" id="cb59-4" title="4">it =<span class="st"> </span><span class="kw">itoken</span>(tokens, <span class="dt">progressbar =</span> <span class="ot">FALSE</span>)</a>
<a class="sourceLine" id="cb59-5" title="5">vocab <-<span class="st"> </span><span class="kw">create_vocabulary</span>(it)</a>
<a class="sourceLine" id="cb59-6" title="6"><span class="kw">str</span>(vocab)</a></code></pre></div>
<pre><code>## Classes 'text2vec_vocabulary' and 'data.frame': 253854 obs. of 3 variables:
## $ term : chr "aaaaaacceglllnorst" "aaaaaaccegllnorrst" "aaaaaah" "aaaaaalmrsstt" ...
## $ term_count: int 1 1 1 1 1 1 1 1 1 1 ...
## $ doc_count : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "ngram")= Named int 1 1
## ..- attr(*, "names")= chr "ngram_min" "ngram_max"
## - attr(*, "document_count")= int 1
## - attr(*, "stopwords")= chr
## - attr(*, "sep_ngram")= chr "_"</code></pre>
<p>We should remove uncommon words, since it is not meaningful to keep a word vector for a word that we saw only once in the entire corpus. In this example we will keep only words which appear at least five times.</p>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb61-1" title="1">vocab <-<span class="st"> </span><span class="kw">prune_vocabulary</span>(vocab, <span class="dt">term_count_min =</span> 5L)</a>
<a class="sourceLine" id="cb61-2" title="2"><span class="kw">min</span>(vocab<span class="op">$</span>term_count)</a></code></pre></div>
<pre><code>## [1] 5</code></pre>
<p>Now we have 71,290 terms in the vocabulary and are ready to construct the term-co-occurrence matrix (TCM).</p>
<div class="sourceCode" id="cb63"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb63-1" title="1"><span class="co"># Use our filtered vocabulary</span></a>
<a class="sourceLine" id="cb63-2" title="2">vectorizer <-<span class="st"> </span><span class="kw">vocab_vectorizer</span>(vocab)</a>
<a class="sourceLine" id="cb63-3" title="3"><span class="co"># use window of 5 for context words</span></a>
<a class="sourceLine" id="cb63-4" title="4">tcm <-<span class="st"> </span><span class="kw">create_tcm</span>(it, vectorizer, <span class="dt">skip_grams_window =</span> 5L)</a>
<a class="sourceLine" id="cb63-5" title="5">tcm[<span class="dv">1</span><span class="op">:</span><span class="dv">10</span>, <span class="dv">1</span><span class="op">:</span><span class="dv">10</span>]</a></code></pre></div>
<pre><code>## 10 x 10 sparse Matrix of class "dgTMatrix"</code></pre>
<pre><code>## [[ suppressing 10 column names 'aapke', 'ababda', 'abakumov' ... ]]</code></pre>
<pre><code>##
## aapke . . . . . . . . . .
## ababda . . . . . . . . . .
## abakumov . . . . . . . . . .
## abalones . . . . . . . . . .
## abano . . . . . . . . . .
## abati . . . . . . . . . .
## abbates . . . . . . 1.25 . . .
## abbesses . . . . . . . . . .
## abderus . . . . . . . . 1 .
## abdications . . . . . . . . . .</code></pre>
<p>Now we have a TCM matrix and can factorize it via the GloVe algorithm.</p>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb67-1" title="1">glove =<span class="st"> </span>GlobalVectors<span class="op">$</span><span class="kw">new</span>(<span class="dt">rank =</span> <span class="dv">50</span>, <span class="dt">x_max =</span> <span class="dv">10</span>)</a>
<a class="sourceLine" id="cb67-2" title="2">wv_main =<span class="st"> </span>glove<span class="op">$</span><span class="kw">fit_transform</span>(tcm, <span class="dt">n_iter =</span> <span class="dv">10</span>, <span class="dt">convergence_tol =</span> <span class="fl">0.01</span>)</a></code></pre></div>
<pre><code>## INFO [23:50:33.466] epoch 1, loss 0.1745
## INFO [23:50:47.080] epoch 2, loss 0.1224
## INFO [23:51:00.388] epoch 3, loss 0.1083
## INFO [23:51:13.728] epoch 4, loss 0.1004
## INFO [23:51:27.567] epoch 5, loss 0.0953
## INFO [23:51:40.924] epoch 6, loss 0.0917
## INFO [23:51:54.514] epoch 7, loss 0.0889
## INFO [23:52:08.667] epoch 8, loss 0.0868
## INFO [23:52:22.352] epoch 9, loss 0.0850
## INFO [23:52:35.699] epoch 10, loss 0.0836</code></pre>
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb69-1" title="1"><span class="kw">dim</span>(wv_main)</a></code></pre></div>
<pre><code>## [1] 71290 50</code></pre>
<p>Note that the model learns two sets of word vectors - main and context. Essentially they are the same since the model is symmetric. From our experience, learning two sets of word vectors leads to higher quality embeddings.</p>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb71-1" title="1">wv_context =<span class="st"> </span>glove<span class="op">$</span>components</a>
<a class="sourceLine" id="cb71-2" title="2"><span class="kw">dim</span>(wv_context)</a></code></pre></div>
<pre><code>## [1] 50 71290</code></pre>
<p>While either of the word-vector matrices can be used as the result, it is usually better (an idea from the GloVe paper) to average or take the sum of the main and context vectors:</p>
<div class="sourceCode" id="cb73"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb73-1" title="1">word_vectors =<span class="st"> </span>wv_main <span class="op">+</span><span class="st"> </span><span class="kw">t</span>(wv_context)</a></code></pre></div>
<p>We can find the closest word vectors for our paris - france + germany example:</p>
<div class="sourceCode" id="cb74"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb74-1" title="1">berlin =<span class="st"> </span>word_vectors[<span class="st">"paris"</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">-</span><span class="st"> </span></a>
<a class="sourceLine" id="cb74-2" title="2"><span class="st"> </span>word_vectors[<span class="st">"france"</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb74-3" title="3"><span class="st"> </span>word_vectors[<span class="st">"germany"</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>]</a>
<a class="sourceLine" id="cb74-4" title="4">cos_sim =<span class="st"> </span><span class="kw">sim2</span>(<span class="dt">x =</span> word_vectors, <span class="dt">y =</span> berlin, <span class="dt">method =</span> <span class="st">"cosine"</span>, <span class="dt">norm =</span> <span class="st">"l2"</span>)</a>
<a class="sourceLine" id="cb74-5" title="5"><span class="kw">head</span>(<span class="kw">sort</span>(cos_sim[,<span class="dv">1</span>], <span class="dt">decreasing =</span> <span class="ot">TRUE</span>), <span class="dv">5</span>)</a></code></pre></div>
<pre><code>## paris berlin bonn london leipzig
## 0.7771973 0.7295444 0.6742783 0.6663386 0.6612857</code></pre>
</div>
</div>
<div id="references" class="section level2">
<h2><span class="header-section-number">3.5</span> references</h2>
<ul>
<li><a href="http://pablobarbera.com/ECPR-SC105/code/16-word-embeddings.html" class="uri">http://pablobarbera.com/ECPR-SC105/code/16-word-embeddings.html</a></li>
<li><a href="https://code.google.com/archive/p/word2vec/" class="uri">https://code.google.com/archive/p/word2vec/</a></li>
<li><a href="https://m-clark.github.io/text-analysis-with-R/word-embeddings.html#wikipedia" class="uri">https://m-clark.github.io/text-analysis-with-R/word-embeddings.html#wikipedia</a></li>
<li><a href="https://juliasilge.com/blog/gender-pronouns/" class="uri">https://juliasilge.com/blog/gender-pronouns/</a></li>
<li><a href="https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/" class="uri">https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/</a></li>
<li><a href="https://machinelearningmastery.com/what-are-word-embeddings/" class="uri">https://machinelearningmastery.com/what-are-word-embeddings/</a></li>
<li><a href="https://rpubs.com/JanpuHou/396443" class="uri">https://rpubs.com/JanpuHou/396443</a></li>
<li><a href="https://mran.microsoft.com/snapshot/2016-03-05/web/packages/text2vec/vignettes/text-vectorization.html" class="uri">https://mran.microsoft.com/snapshot/2016-03-05/web/packages/text2vec/vignettes/text-vectorization.html</a></li>
<li><a href="https://cbail.github.io/textasdata/word2vec/rmarkdown/word2vec.html" class="uri">https://cbail.github.io/textasdata/word2vec/rmarkdown/word2vec.html</a></li>
<li><a href="https://www.jla-data.net/eng/vocabulary-based-text-classification/" class="uri">https://www.jla-data.net/eng/vocabulary-based-text-classification/</a></li>
<li><a href="http://text2vec.org/glove.html" class="uri">http://text2vec.org/glove.html</a></li>
<li><a href="http://text2vec.org/similarity.html" class="uri">http://text2vec.org/similarity.html</a></li>
<li><a href="https://www.r-craft.org/r-news/get-busy-with-word-embeddings-an-introduction/" class="uri">https://www.r-craft.org/r-news/get-busy-with-word-embeddings-an-introduction/</a></li>
</ul>
</div>
</div>
</section>
</div>
</div>
</div>
<a href="text-processing.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="text-classification.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
// Start the GitBook reader once the "gitbook" module is available.
// Each option group is named first and then handed to start() in the
// same key order as the generated configuration.
gitbook.require(["gitbook"], function (book) {
  // Per-network sharing flags plus the list offered under "all".
  // NOTE(review): presumably these drive the toolbar share buttons; confirm
  // against the bookdown gitbook documentation.
  var sharingOptions = {
    github: false,
    facebook: true,
    twitter: true,
    linkedin: false,
    weibo: false,
    instapaper: false,
    vk: false,
    all: ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
  };

  // Initial reader font settings.
  var fontOptions = {
    theme: "white",
    family: "sans",
    size: 2
  };

  // The edit / history / view entries are left unset (null link and text).
  var editOptions = { link: null, text: null };
  var historyOptions = { link: null, text: null };
  var viewOptions = { link: null, text: null };

  // Alternative formats listed for download.
  var downloadFiles = ["NLP-book.pdf", "NLP-book.epub"];

  // Table-of-contents collapsing mode.
  var tocOptions = { collapse: "subsection" };

  book.start({
    sharing: sharingOptions,
    fontsettings: fontOptions,
    edit: editOptions,
    history: historyOptions,
    view: viewOptions,
    download: downloadFiles,
    toc: tocOptions
  });
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // Resolve the MathJax URL. The literal "true" (or an empty string) means
  // no explicit URL is present, so the RStudio-hosted default is used.
  // NOTE(review): "true" looks like a value substituted at build time; confirm.
  var mathjaxUrl = "true";
  if (mathjaxUrl === "" || mathjaxUrl === "true") {
    mathjaxUrl = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
  }

  // Unless the page was opened from disk (file:), drop an explicit
  // http:/https: scheme so the URL becomes protocol-relative.
  if (location.protocol !== "file:" && /^https?:/.test(mathjaxUrl)) {
    mathjaxUrl = mathjaxUrl.replace(/^https?:/, '');
  }

  // Inject the <script> element into <head> to start loading MathJax.
  var loader = document.createElement("script");
  loader.type = "text/javascript";
  loader.src = mathjaxUrl;
  document.getElementsByTagName("head")[0].appendChild(loader);
})();
</script>
</body>
</html>