Word-embeddings.html

<!DOCTYPE html>
<html lang="en" xml:lang="en">
<head>

  <meta charset="utf-8" />
  <meta http-equiv="X-UA-Compatible" content="IE=edge" />
  <title>Chapter 3 Word embeddings | Natural Language Processing with R</title>
  <meta name="description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
  <meta name="generator" content="bookdown 0.18 and GitBook 2.6.7" />

  <meta property="og:title" content="Chapter 3 Word embeddings | Natural Language Processing with R" />
  <meta property="og:type" content="book" />
  
  
  <meta property="og:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
  

  <meta name="twitter:card" content="summary" />
  <meta name="twitter:title" content="Chapter 3 Word embeddings | Natural Language Processing with R" />
  
  <meta name="twitter:description" content="This is a tutorial of various techniques used in natural language processing and text mining." />
  

<meta name="author" content="Saif SHabou" />


<meta name="date" content="2020-05-06" />

  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="apple-mobile-web-app-capable" content="yes" />
  <meta name="apple-mobile-web-app-status-bar-style" content="black" />
  
  
<link rel="prev" href="text-processing.html"/>
<link rel="next" href="text-classification.html"/>
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />


<style type="text/css">
/* Syntax-highlighting styles for code listings. Each line of a listing is an
   <a class="sourceLine"> element inside <code class="sourceCode">. */

/* Lay each source line out as an inline block with a fixed line height. */
a.sourceLine { display: inline-block; line-height: 1.25; }
/* Line anchors are not interactive and inherit the surrounding text styling. */
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
/* Give empty line anchors an explicit height. */
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
/* Preserve whitespace in listings; position: relative anchors offset children. */
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
/* On screen, the listing wrapper scrolls instead of overflowing. */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, wrap long lines and use a hanging indent for each line. */
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
/* Numbered listings: shift each line left to make room for the number box. */
pre.numberSource a.sourceLine
  { position: relative; left: -4em; }
/* The line number is generated from the anchor's title attribute. It accepts
   pointer events but is excluded from text selection. */
pre.numberSource a.sourceLine::before
  { content: attr(title);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; pointer-events: all; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
/* Gutter for numbered listings: left margin plus a thin separator rule. */
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
/* Intentionally empty rule (no listing-wide overrides). */
div.sourceCode
  {  }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
/* Token colors: one rule per highlight token class emitted in the listings. */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>

<link rel="stylesheet" href="style.css" type="text/css" />
</head>

<body>


  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">

    <div class="book-summary">
      <nav role="navigation">

<ul class="summary">
<li><a href="./">NLP with R</a></li>

<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Introduction</a></li>
<li class="chapter" data-level="2" data-path="text-processing.html"><a href="text-processing.html"><i class="fa fa-check"></i><b>2</b> Text processing</a><ul>
<li class="chapter" data-level="2.1" data-path="text-processing.html"><a href="text-processing.html#text-data"><i class="fa fa-check"></i><b>2.1</b> Text data</a></li>
<li class="chapter" data-level="2.2" data-path="text-processing.html"><a href="text-processing.html#nlp-applications"><i class="fa fa-check"></i><b>2.2</b> NLP applications</a></li>
<li class="chapter" data-level="2.3" data-path="text-processing.html"><a href="text-processing.html#tokenization"><i class="fa fa-check"></i><b>2.3</b> Tokenization</a></li>
<li class="chapter" data-level="2.4" data-path="text-processing.html"><a href="text-processing.html#stop-words-handeling"><i class="fa fa-check"></i><b>2.4</b> Stop words handling</a></li>
<li class="chapter" data-level="2.5" data-path="text-processing.html"><a href="text-processing.html#words-frequencies"><i class="fa fa-check"></i><b>2.5</b> Words frequencies</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="Word-embeddings.html"><a href="Word-embeddings.html"><i class="fa fa-check"></i><b>3</b> Word embeddings</a><ul>
<li class="chapter" data-level="3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#vectorizing-text"><i class="fa fa-check"></i><b>3.1</b> Vectorizing text</a></li>
<li class="chapter" data-level="3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#one-hot-encoding"><i class="fa fa-check"></i><b>3.2</b> One-hot encoding</a></li>
<li class="chapter" data-level="3.3" data-path="Word-embeddings.html"><a href="Word-embeddings.html#word-embeddings-methods"><i class="fa fa-check"></i><b>3.3</b> Word embeddings methods</a><ul>
<li class="chapter" data-level="3.3.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#learn-world-embeddings"><i class="fa fa-check"></i><b>3.3.1</b> Learn word embeddings</a></li>
<li class="chapter" data-level="3.3.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#pre-trained-word-embeddings"><i class="fa fa-check"></i><b>3.3.2</b> Pre-trained word embeddings</a></li>
</ul></li>
<li class="chapter" data-level="3.4" data-path="Word-embeddings.html"><a href="Word-embeddings.html#applications"><i class="fa fa-check"></i><b>3.4</b> Applications</a><ul>
<li class="chapter" data-level="3.4.1" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-skip-gram"><i class="fa fa-check"></i><b>3.4.1</b> Using Skip-Gram</a></li>
<li class="chapter" data-level="3.4.2" data-path="Word-embeddings.html"><a href="Word-embeddings.html#using-glove"><i class="fa fa-check"></i><b>3.4.2</b> Using GloVe</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="Word-embeddings.html"><a href="Word-embeddings.html#references"><i class="fa fa-check"></i><b>3.5</b> references</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="text-classification.html"><a href="text-classification.html"><i class="fa fa-check"></i><b>4</b> Text classification</a><ul>
<li class="chapter" data-level="4.1" data-path="text-classification.html"><a href="text-classification.html#load-the-data"><i class="fa fa-check"></i><b>4.1</b> Load the data</a></li>
<li class="chapter" data-level="4.2" data-path="text-classification.html"><a href="text-classification.html#prepare-the-data-for-neural-network"><i class="fa fa-check"></i><b>4.2</b> Prepare the data for neural network</a></li>
<li class="chapter" data-level="4.3" data-path="text-classification.html"><a href="text-classification.html#building-the-model"><i class="fa fa-check"></i><b>4.3</b> Building the model</a></li>
<li class="chapter" data-level="4.4" data-path="text-classification.html"><a href="text-classification.html#testing-the-model"><i class="fa fa-check"></i><b>4.4</b> Testing the model</a></li>
<li class="chapter" data-level="4.5" data-path="text-classification.html"><a href="text-classification.html#reference"><i class="fa fa-check"></i><b>4.5</b> Reference</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="RNN.html"><a href="RNN.html"><i class="fa fa-check"></i><b>5</b> Recurrent Neural Networks (RNN)</a><ul>
<li class="chapter" data-level="5.1" data-path="RNN.html"><a href="RNN.html#understanding-recurrent-neural-network"><i class="fa fa-check"></i><b>5.1</b> Understanding Recurrent Neural Network</a></li>
<li class="chapter" data-level="5.2" data-path="RNN.html"><a href="RNN.html#rnn-with-keras"><i class="fa fa-check"></i><b>5.2</b> RNN with Keras</a></li>
<li class="chapter" data-level="5.3" data-path="RNN.html"><a href="RNN.html#lstm-with-keras"><i class="fa fa-check"></i><b>5.3</b> LSTM with Keras</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html"><i class="fa fa-check"></i><b>6</b> Sentiment Analysis</a><ul>
<li class="chapter" data-level="6.1" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#the-sentiments-dataset"><i class="fa fa-check"></i><b>6.1</b> The “Sentiments” dataset</a></li>
<li class="chapter" data-level="6.2" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#application"><i class="fa fa-check"></i><b>6.2</b> Application</a></li>
<li class="chapter" data-level="6.3" data-path="sentiment-analysis.html"><a href="sentiment-analysis.html#references-1"><i class="fa fa-check"></i><b>6.3</b> References:</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html"><i class="fa fa-check"></i><b>7</b> Word and document frequency (TF-IDF)</a><ul>
<li class="chapter" data-level="7.1" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#term-frequency-application"><i class="fa fa-check"></i><b>7.1</b> Term frequency application</a></li>
<li class="chapter" data-level="7.2" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#zipfs-law"><i class="fa fa-check"></i><b>7.2</b> Zipf’s law</a></li>
<li class="chapter" data-level="7.3" data-path="word-and-document-frequency-tf-idf.html"><a href="word-and-document-frequency-tf-idf.html#tf_idf-metric"><i class="fa fa-check"></i><b>7.3</b> TF_IDF metric</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="topic-modeling.html"><a href="topic-modeling.html"><i class="fa fa-check"></i><b>8</b> Topic modeling</a><ul>
<li class="chapter" data-level="8.1" data-path="topic-modeling.html"><a href="topic-modeling.html#latent-dirichlet-allocation"><i class="fa fa-check"></i><b>8.1</b> Latent Dirichlet allocation</a></li>
<li class="chapter" data-level="8.2" data-path="topic-modeling.html"><a href="topic-modeling.html#document-topic-probabilities"><i class="fa fa-check"></i><b>8.2</b> Document-topic probabilities</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html"><i class="fa fa-check"></i><b>9</b> Words’ relationships analysis</a><ul>
<li class="chapter" data-level="9.1" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#extracting-bi-grams"><i class="fa fa-check"></i><b>9.1</b> Extracting bi-grams</a></li>
<li class="chapter" data-level="9.2" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#analyzing-bi-grams"><i class="fa fa-check"></i><b>9.2</b> Analyzing bi-grams</a></li>
<li class="chapter" data-level="9.3" data-path="words-relationships-analysis.html"><a href="words-relationships-analysis.html#visualizing-a-network-of-bigrams"><i class="fa fa-check"></i><b>9.3</b> Visualizing a network of bigrams</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="document-term-matrix.html"><a href="document-term-matrix.html"><i class="fa fa-check"></i><b>10</b> Document-term matrix</a><ul>
<li class="chapter" data-level="10.1" data-path="document-term-matrix.html"><a href="document-term-matrix.html#converting-dtm-into-dataframe"><i class="fa fa-check"></i><b>10.1</b> Converting DTM into dataframe</a></li>
<li class="chapter" data-level="10.2" data-path="document-term-matrix.html"><a href="document-term-matrix.html#generating-document-term-matrix"><i class="fa fa-check"></i><b>10.2</b> Generating Document-term matrix</a></li>
</ul></li>
<li class="divider"></li>
<li><a href="https://github.com/rstudio/bookdown" target="_blank" rel="noopener">Published with bookdown</a></li>

</ul>

      </nav>
    </div>

    <div class="book-body">
      <div class="body-inner">
        <div class="book-header" role="navigation">
          <h1>
            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Natural Language Processing with R</a>
          </h1>
        </div>

        <div class="page-wrapper" tabindex="-1" role="main">
          <div class="page-inner">

            <section class="normal" id="section-">
<div id="Word-embeddings" class="section level1">
<h1><span class="header-section-number">Chapter 3</span> Word embeddings</h1>
<p>This section is based on this book: <a href="https://github.com/jjallaire/deep-learning-with-r-notebooks" class="uri">https://github.com/jjallaire/deep-learning-with-r-notebooks</a></p>
<div id="vectorizing-text" class="section level2">
<h2><span class="header-section-number">3.1</span> Vectorizing text</h2>
<p>Vectorizing text is the process of transforming text into numeric tensors. It consists of applying some tokenization scheme and then associating numeric vectors with the generated tokens. The generated vectors are packed into sequence tensors and fed into a deep neural network.
There are different ways to associate a vector with a token, such as <em>one-hot encoding</em> and <em>token embedding</em> (typically used for words and called <em>word embedding</em>).</p>
</div>
<div id="one-hot-encoding" class="section level2">
<h2><span class="header-section-number">3.2</span> One-hot encoding</h2>
<p>It consists of one-hot encoding the words in a sentence against the whole vocabulary. We create a vector with length equal to the vocabulary size and place a one at the index that corresponds to the word in the sentence. Then, we can concatenate the one-hot vectors for each word. This method is considered inefficient since we obtain a sparse one-hot encoded vector (most entries are zero).</p>
<div class="figure">
<img src="images/one-hot.png" alt="One-hot encoding (source:https://www.tensorflow.org/tutorials/text/word_embeddings)" />
<p class="caption">One-hot encoding (source:<a href="https://www.tensorflow.org/tutorials/text/word_embeddings" class="uri">https://www.tensorflow.org/tutorials/text/word_embeddings</a>)</p>
</div>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb18-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb18-2" title="2">samples &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;The cat sat on the mat.&quot;</span>, <span class="st">&quot;The dog ate my homework.&quot;</span>)</a>
<a class="sourceLine" id="cb18-3" title="3"><span class="co"># Creates a tokenizer, configured to only take into account the 1,000 </span></a>
<a class="sourceLine" id="cb18-4" title="4"><span class="co"># most common words, then builds the word index.</span></a>
<a class="sourceLine" id="cb18-5" title="5">tokenizer &lt;-<span class="st"> </span><span class="kw">text_tokenizer</span>(<span class="dt">num_words =</span> <span class="dv">1000</span>) <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb18-6" title="6"><span class="st">  </span><span class="kw">fit_text_tokenizer</span>(samples)</a>
<a class="sourceLine" id="cb18-7" title="7"><span class="co"># Turns strings into lists of integer indices</span></a>
<a class="sourceLine" id="cb18-8" title="8">sequences &lt;-<span class="st"> </span><span class="kw">texts_to_sequences</span>(tokenizer, samples)</a>
<a class="sourceLine" id="cb18-9" title="9"><span class="co"># You could also directly get the one-hot binary representations. Vectorization </span></a>
<a class="sourceLine" id="cb18-10" title="10"><span class="co"># modes other than one-hot encoding are supported by this tokenizer.</span></a>
<a class="sourceLine" id="cb18-11" title="11">one_hot_results &lt;-<span class="st"> </span><span class="kw">texts_to_matrix</span>(tokenizer, samples, <span class="dt">mode =</span> <span class="st">&quot;binary&quot;</span>)</a>
<a class="sourceLine" id="cb18-12" title="12"><span class="co"># How you can recover the word index that was computed</span></a>
<a class="sourceLine" id="cb18-13" title="13">word_index &lt;-<span class="st"> </span>tokenizer<span class="op">$</span>word_index</a>
<a class="sourceLine" id="cb18-14" title="14"><span class="kw">cat</span>(<span class="st">&quot;Found&quot;</span>, <span class="kw">length</span>(word_index), <span class="st">&quot;unique tokens.</span><span class="ch">\n</span><span class="st">&quot;</span>)</a></code></pre></div>
<pre><code>## Found 9 unique tokens.</code></pre>
</div>
<div id="word-embeddings-methods" class="section level2">
<h2><span class="header-section-number">3.3</span> Word embeddings methods</h2>
<p>The vectors obtained with one-hot encoding are binary, sparse and very high-dimensional (their dimensionality equals the number of words in the vocabulary). In contrast, “word embeddings” are low-dimensional dense vectors (as opposed to sparse vectors). They are learned from data. They are commonly 256-dimensional, 512-dimensional, or 1024-dimensional when dealing with large vocabularies.</p>
<p>There are two methods for obtaining word embeddings:</p>
<ul>
<li>Learn word embeddings jointly with a specified task (document classification, sentiment analysis…). For this, we start with random word vectors and learn the word vectors in the same way that we learn the weights of a neural network.</li>
<li>Use “pre-trained” word embeddings and apply them to our specific task.</li>
</ul>
<div id="learn-world-embeddings" class="section level3">
<h3><span class="header-section-number">3.3.1</span> Learn word embeddings</h3>
<p>Word embeddings aim at mapping human language into a geometric space in such a way that geometric relationships between word vectors reflect the semantic relationships between the words. For example, synonyms should be embedded into similar word vectors. We expect the geometric distance between any two word vectors to represent the semantic distance between the associated words. Among the common meaningful geometric transformations in word embeddings, we can cite the “gender vectors” and “plural vectors”. For example, by adding a “female vector” to the vector “king”, we obtain the vector “queen”. In the same way, by adding a “plural vector”, we obtain “kings”. It is hard to find the “ideal” word embedding space to perfectly map general human language. Word embedding performance depends on the task we are working on. A word embedding for an English-language movie review sentiment analysis model may look very different from one for an English-language legal document classification model, since the importance of some semantic relationships varies from task to task.
Therefore, it is useful to learn a new embedding space with every new task. Keras offers the possibility of learning embeddings using <code>layer_embedding()</code>.</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb20-1" title="1"><span class="co"># the embedding layer takes at least two arguments:</span></a>
<a class="sourceLine" id="cb20-2" title="2"><span class="co"># - the number of possible tokens, here 1000</span></a>
<a class="sourceLine" id="cb20-3" title="3"><span class="co"># - the dimensionality of the embeddings, here 64</span></a>
<a class="sourceLine" id="cb20-4" title="4">embedding_layer =<span class="st"> </span><span class="kw">layer_embedding</span>(<span class="dt">input_dim =</span> <span class="dv">1000</span>, <span class="dt">output_dim =</span> <span class="dv">64</span>)</a></code></pre></div>
<p>The <code>embedding_layer</code> is like a dictionary that maps integer indices to dense vectors. It takes as input a 2D tensor of integers, of shape <code>(samples, sequence_length)</code>, where each entry is a sequence of integers. It generates a 3D floating-point tensor, of shape <code>(samples, sequence_length, embedding_dimensionality)</code>.</p>
<p>Let’s apply <code>embedding_layer</code> to the IMDB movie-review sentiment prediction task. We will consider only the top 10,000 most common words and cut off the review after only 20 words. The network will learn 8-dimensional embeddings for each of the 10,000 words, turn the input integer sequences (2D integer tensor) into embedded sequences (3D float tensor), flatten the tensor to 2D, and train a single dense layer on top for classification.</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb21-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb21-2" title="2"><span class="co"># Number of words to consider as features</span></a>
<a class="sourceLine" id="cb21-3" title="3">max_features =<span class="st"> </span><span class="dv">10000</span></a>
<a class="sourceLine" id="cb21-4" title="4"><span class="co"># cut texts after this number of words (among top max_features most common words)</span></a>
<a class="sourceLine" id="cb21-5" title="5">maxlen =<span class="st"> </span><span class="dv">20</span></a>
<a class="sourceLine" id="cb21-6" title="6"><span class="co"># load the data as lists of integers</span></a>
<a class="sourceLine" id="cb21-7" title="7">imdb =<span class="st"> </span><span class="kw">dataset_imdb</span>(<span class="dt">num_words =</span> max_features)</a>
<a class="sourceLine" id="cb21-8" title="8"><span class="kw">c</span>(<span class="kw">c</span>(x_train, y_train), <span class="kw">c</span>(x_test, y_test)) <span class="op">%&lt;-%</span><span class="st"> </span>imdb</a>
<a class="sourceLine" id="cb21-9" title="9"><span class="co"># This turns our lists of integers</span></a>
<a class="sourceLine" id="cb21-10" title="10"><span class="co"># into a 2D integer tensor of shape `(samples, maxlen)`</span></a>
<a class="sourceLine" id="cb21-11" title="11">x_train =<span class="st"> </span><span class="kw">pad_sequences</span>(x_train, <span class="dt">maxlen =</span> maxlen)</a>
<a class="sourceLine" id="cb21-12" title="12">x_test =<span class="st"> </span><span class="kw">pad_sequences</span>(x_test, <span class="dt">maxlen =</span> maxlen)</a></code></pre></div>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb22-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb22-2" title="2">model =<span class="st"> </span><span class="kw">keras_model_sequential</span>() <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-3" title="3"><span class="st">  </span><span class="co"># we specify the maximum input length to our embedding layer</span></a>
<a class="sourceLine" id="cb22-4" title="4"><span class="st">  </span><span class="co"># so we can later flatten the embedded inputs</span></a>
<a class="sourceLine" id="cb22-5" title="5"><span class="st">  </span><span class="kw">layer_embedding</span>(<span class="dt">input_dim =</span> <span class="dv">10000</span>, <span class="dt">output_dim =</span> <span class="dv">8</span>, <span class="dt">input_length =</span> maxlen) <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-6" title="6"><span class="st">  </span><span class="co"># we flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen * 8)</span></a>
<a class="sourceLine" id="cb22-7" title="7"><span class="st">  </span><span class="kw">layer_flatten</span>() <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-8" title="8"><span class="st">  </span><span class="co"># We add the classifier on top </span></a>
<a class="sourceLine" id="cb22-9" title="9"><span class="st">  </span><span class="kw">layer_dense</span>(<span class="dt">units =</span> <span class="dv">1</span>, <span class="dt">activation =</span> <span class="st">&quot;sigmoid&quot;</span>)</a>
<a class="sourceLine" id="cb22-10" title="10"></a>
<a class="sourceLine" id="cb22-11" title="11">model <span class="op">%&gt;%</span><span class="st">  </span><span class="kw">compile</span>(</a>
<a class="sourceLine" id="cb22-12" title="12">  <span class="dt">optimizer =</span> <span class="st">&quot;rmsprop&quot;</span>,</a>
<a class="sourceLine" id="cb22-13" title="13">  <span class="dt">loss =</span> <span class="st">&quot;binary_crossentropy&quot;</span>,</a>
<a class="sourceLine" id="cb22-14" title="14">  <span class="dt">metrics =</span> <span class="kw">c</span>(<span class="st">&quot;acc&quot;</span>)</a>
<a class="sourceLine" id="cb22-15" title="15">)</a>
<a class="sourceLine" id="cb22-16" title="16"></a>
<a class="sourceLine" id="cb22-17" title="17">history =<span class="st"> </span>model <span class="op">%&gt;%</span><span class="st">  </span><span class="kw">fit</span>(</a>
<a class="sourceLine" id="cb22-18" title="18">  x_train, y_train,</a>
<a class="sourceLine" id="cb22-19" title="19">  <span class="dt">epochs =</span> <span class="dv">10</span>, <span class="co">#10</span></a>
<a class="sourceLine" id="cb22-20" title="20">  <span class="dt">batch_size =</span> <span class="dv">32</span>,</a>
<a class="sourceLine" id="cb22-21" title="21">  <span class="dt">validation_split =</span> <span class="fl">0.2</span></a>
<a class="sourceLine" id="cb22-22" title="22">)</a>
<a class="sourceLine" id="cb22-23" title="23"></a>
<a class="sourceLine" id="cb22-24" title="24"><span class="kw">plot</span>(history)</a></code></pre></div>
<pre><code>## `geom_smooth()` using formula &#39;y ~ x&#39;</code></pre>
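<p><img src="NLP-book_files/figure-html/layer_embedding%20-1.png" alt="Training history plot of the embedding model: loss and accuracy per epoch for the training and validation data" width="672" /></p>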
</div>
<div id="pre-trained-word-embeddings" class="section level3">
<h3><span class="header-section-number">3.3.2</span> Pre-trained word embeddings</h3>
<p>When we have little training data available to learn task-specific word embeddings based on our vocabulary, it is preferable to use pre-trained word embeddings. This technique is similar to transfer learning in image classification tasks, where we use a pretrained classifier. A pre-computed embedding is supposed to capture generic aspects of language structure. These word embeddings are trained based on the co-occurrence of words in sentences and documents within a large corpus of text. We can distinguish two main powerful word embedding models: <strong>Word2Vec</strong> and <strong>GloVe</strong>.</p>
<div id="word2vec" class="section level4">
<h4><span class="header-section-number">3.3.2.1</span> Word2Vec</h4>
</div>
<div id="glove" class="section level4">
<h4><span class="header-section-number">3.3.2.2</span> GloVe</h4>
</div>
</div>
</div>
<div id="applications" class="section level2">
<h2><span class="header-section-number">3.4</span> Applications</h2>
<div id="using-skip-gram" class="section level3">
<h3><span class="header-section-number">3.4.1</span> Using Skip-Gram</h3>
<p>We use the Amazon Fine Foods Reviews dataset, which consists of 500,000 reviews of Amazon fine food including product and user information, ratings, and narrative text.
Source: <a href="https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/" class="uri">https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/</a></p>
<div id="getting-the-data" class="section level4">
<h4><span class="header-section-number">3.4.1.1</span> Getting the data</h4>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb24-1" title="1"><span class="co"># we download the data</span></a>
<a class="sourceLine" id="cb24-2" title="2"><span class="kw">download.file</span>(<span class="st">&quot;https://snap.stanford.edu/data/finefoods.txt.gz&quot;</span>, <span class="st">&quot;finefoods.txt.gz&quot;</span>)</a></code></pre></div>
<p>Now we load the plain text reviews:</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb25-1" title="1"><span class="kw">library</span>(readr)</a>
<a class="sourceLine" id="cb25-2" title="2"><span class="kw">library</span>(stringr)</a>
<a class="sourceLine" id="cb25-3" title="3">reviews &lt;-<span class="st"> </span><span class="kw">read_lines</span>(<span class="st">&quot;finefoods.txt.gz&quot;</span>) </a>
<a class="sourceLine" id="cb25-4" title="4">reviews &lt;-<span class="st"> </span>reviews[<span class="kw">str_sub</span>(reviews, <span class="dv">1</span>, <span class="dv">12</span>) <span class="op">==</span><span class="st"> &quot;review/text:&quot;</span>]</a>
<a class="sourceLine" id="cb25-5" title="5">reviews &lt;-<span class="st"> </span><span class="kw">str_sub</span>(reviews, <span class="dt">start =</span> <span class="dv">14</span>)</a>
<a class="sourceLine" id="cb25-6" title="6">reviews &lt;-<span class="st"> </span><span class="kw">iconv</span>(reviews, <span class="dt">to =</span> <span class="st">&quot;UTF-8&quot;</span>)</a>
<a class="sourceLine" id="cb25-7" title="7"><span class="kw">head</span>(reviews, <span class="dv">2</span>)</a></code></pre></div>
<pre><code>## [1] &quot;I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.&quot;
## [2] &quot;Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as \&quot;Jumbo\&quot;.&quot;</code></pre>
</div>
<div id="preprocessing" class="section level4">
<h4><span class="header-section-number">3.4.1.2</span> Preprocessing</h4>
<p>We use <code>text_tokenizer</code> in order to transform each review into a sequence of integer tokens. By fixing <code>num_words = 20000</code>, we assign an integer token to each of the 20,000 most common words (the other words will be assigned to token 0).</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb27-1" title="1"><span class="kw">library</span>(keras)</a>
<a class="sourceLine" id="cb27-2" title="2">tokenizer =<span class="st"> </span><span class="kw">text_tokenizer</span>(<span class="dt">num_words =</span> <span class="dv">20000</span>)</a>
<a class="sourceLine" id="cb27-3" title="3">tokenizer <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">fit_text_tokenizer</span>(reviews)</a>
<a class="sourceLine" id="cb27-4" title="4"><span class="co">#we can show the number of documents</span></a>
<a class="sourceLine" id="cb27-5" title="5">tokenizer<span class="op">$</span>document_count</a></code></pre></div>
<pre><code>## [1] 568454</code></pre>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb29-1" title="1"><span class="co"># we can show the word index list</span></a>
<a class="sourceLine" id="cb29-2" title="2">tokenizer<span class="op">$</span>word_index <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb29-3" title="3"><span class="st">  </span><span class="kw">head</span>()</a></code></pre></div>
<pre><code>## $the
## [1] 1
## 
## $i
## [1] 2
## 
## $and
## [1] 3
## 
## $a
## [1] 4
## 
## $to
## [1] 5
## 
## $it
## [1] 6</code></pre>
</div>
<div id="skpi-gram-model" class="section level4">
<h4><span class="header-section-number">3.4.1.3</span> Skip-Gram model</h4>
<p>In the skip-gram model, we use each word as input to a log-linear classifier, then predict words within a certain range before and after this word. It would be very computationally expensive if we output a probability distribution over the whole vocabulary for each target word we input into the model. Therefore, we will use negative sampling. It consists of sampling some words that don’t appear in the context and training a binary classifier to predict whether the context word we passed is truly from the context or not.</p>
<p>Let’s define a generator function to yield batches for model training. This generator function will receive a vector of texts, a tokenizer and the arguments for the skip-gram (the size of the window around each target word we examine and how many negative samples we want to sample for each target word).</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb31-1" title="1"><span class="kw">library</span>(reticulate)</a>
<a class="sourceLine" id="cb31-2" title="2"><span class="kw">library</span>(purrr)</a>
<a class="sourceLine" id="cb31-3" title="3">skipgrams_generator &lt;-<span class="st"> </span><span class="cf">function</span>(text, tokenizer, window_size, negative_samples) {</a>
<a class="sourceLine" id="cb31-4" title="4">  gen &lt;-<span class="st"> </span><span class="kw">texts_to_sequences_generator</span>(tokenizer, <span class="kw">sample</span>(text))</a>
<a class="sourceLine" id="cb31-5" title="5">  <span class="cf">function</span>() {</a>
<a class="sourceLine" id="cb31-6" title="6">    skip &lt;-<span class="st"> </span><span class="kw">generator_next</span>(gen) <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb31-7" title="7"><span class="st">      </span><span class="kw">skipgrams</span>(</a>
<a class="sourceLine" id="cb31-8" title="8">        <span class="dt">vocabulary_size =</span> tokenizer<span class="op">$</span>num_words, </a>
<a class="sourceLine" id="cb31-9" title="9">        <span class="dt">window_size =</span> window_size, </a>
<a class="sourceLine" id="cb31-10" title="10">        <span class="dt">negative_samples =</span> <span class="dv">1</span></a>
<a class="sourceLine" id="cb31-11" title="11">      )</a>
<a class="sourceLine" id="cb31-12" title="12">    x &lt;-<span class="st"> </span><span class="kw">transpose</span>(skip<span class="op">$</span>couples) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">map</span>(. <span class="op">%&gt;%</span><span class="st"> </span>unlist <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">as.matrix</span>(<span class="dt">ncol =</span> <span class="dv">1</span>))</a>
<a class="sourceLine" id="cb31-13" title="13">    y &lt;-<span class="st"> </span>skip<span class="op">$</span>labels <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">as.matrix</span>(<span class="dt">ncol =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb31-14" title="14">    <span class="kw">list</span>(x, y)</a>
<a class="sourceLine" id="cb31-15" title="15">  }</a>
<a class="sourceLine" id="cb31-16" title="16">} </a></code></pre></div>
<p>We now define the Keras model using the Keras functional API.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb32-1" title="1"><span class="co"># Dimension of the embedding vector</span></a>
<a class="sourceLine" id="cb32-2" title="2">embedding_size =<span class="st"> </span><span class="dv">128</span> </a>
<a class="sourceLine" id="cb32-3" title="3"><span class="co"># how many words to consider left and right</span></a>
<a class="sourceLine" id="cb32-4" title="4">skip_window =<span class="st"> </span><span class="dv">5</span></a>
<a class="sourceLine" id="cb32-5" title="5"><span class="co"># number of negative examples to sample for each word</span></a>
<a class="sourceLine" id="cb32-6" title="6">num_sampled =<span class="st"> </span><span class="dv">1</span></a></code></pre></div>
<p>We will write placeholders for the inputs using the <code>layer_input</code> function.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb33-1" title="1">input_target =<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb33-2" title="2">input_context =<span class="st"> </span><span class="kw">layer_input</span>(<span class="dt">shape =</span> <span class="dv">1</span>)</a></code></pre></div>
<p>Now let’s define the embedding matrix. The embedding is a matrix with dimensions (vocabulary, embedding_size) that acts as a lookup table for the word vectors.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb34-1" title="1">embedding &lt;-<span class="st"> </span><span class="kw">layer_embedding</span>(</a>
<a class="sourceLine" id="cb34-2" title="2">  <span class="dt">input_dim =</span> tokenizer<span class="op">$</span>num_words <span class="op">+</span><span class="st"> </span><span class="dv">1</span>, </a>
<a class="sourceLine" id="cb34-3" title="3">  <span class="dt">output_dim =</span> embedding_size, </a>
<a class="sourceLine" id="cb34-4" title="4">  <span class="dt">input_length =</span> <span class="dv">1</span>, </a>
<a class="sourceLine" id="cb34-5" title="5">  <span class="dt">name =</span> <span class="st">&quot;embedding&quot;</span></a>
<a class="sourceLine" id="cb34-6" title="6">)</a>
<a class="sourceLine" id="cb34-7" title="7"></a>
<a class="sourceLine" id="cb34-8" title="8">target_vector &lt;-<span class="st"> </span>input_target <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb34-9" title="9"><span class="st">  </span><span class="kw">embedding</span>() <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb34-10" title="10"><span class="st">  </span><span class="kw">layer_flatten</span>()</a>
<a class="sourceLine" id="cb34-11" title="11"></a>
<a class="sourceLine" id="cb34-12" title="12">context_vector &lt;-<span class="st"> </span>input_context <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb34-13" title="13"><span class="st">  </span><span class="kw">embedding</span>() <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb34-14" title="14"><span class="st">  </span><span class="kw">layer_flatten</span>()</a></code></pre></div>
<p>Now we define how the <code>target_vector</code> will be related to the <code>context_vector</code> in order to make the network output equal to 1 when the context word really appeared in the context and 0 otherwise. We want target_vector to be similar to the context_vector if they appeared in the same context. A typical measure of similarity is the cosine similarity. Given two vectors A and B, the cosine similarity is defined by the Euclidean dot product of A and B normalized by their magnitudes. As we don’t need the similarity to be normalized inside the network, we will only calculate the dot product and then output a dense layer with sigmoid activation.</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb35-1" title="1">dot_product &lt;-<span class="st"> </span><span class="kw">layer_dot</span>(<span class="kw">list</span>(target_vector, context_vector), <span class="dt">axes =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb35-2" title="2">output &lt;-<span class="st"> </span><span class="kw">layer_dense</span>(dot_product, <span class="dt">units =</span> <span class="dv">1</span>, <span class="dt">activation =</span> <span class="st">&quot;sigmoid&quot;</span>)</a></code></pre></div>
<p>Let’s create and compile the model</p>
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb36-1" title="1">model &lt;-<span class="st"> </span><span class="kw">keras_model</span>(<span class="kw">list</span>(input_target, input_context), output)</a>
<a class="sourceLine" id="cb36-2" title="2">model <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">compile</span>(<span class="dt">loss =</span> <span class="st">&quot;binary_crossentropy&quot;</span>, <span class="dt">optimizer =</span> <span class="st">&quot;adam&quot;</span>)</a>
<a class="sourceLine" id="cb36-3" title="3"><span class="kw">summary</span>(model)</a></code></pre></div>
<pre><code>## Model: &quot;model&quot;
## ________________________________________________________________________________
## Layer (type)              Output Shape      Param #  Connected to               
## ================================================================================
## input_1 (InputLayer)      [(None, 1)]       0                                   
## ________________________________________________________________________________
## input_2 (InputLayer)      [(None, 1)]       0                                   
## ________________________________________________________________________________
## embedding (Embedding)     (None, 1, 128)    2560128  input_1[0][0]              
##                                                      input_2[0][0]              
## ________________________________________________________________________________
## flatten_1 (Flatten)       (None, 128)       0        embedding[0][0]            
## ________________________________________________________________________________
## flatten_2 (Flatten)       (None, 128)       0        embedding[1][0]            
## ________________________________________________________________________________
## dot (Dot)                 (None, 1)         0        flatten_1[0][0]            
##                                                      flatten_2[0][0]            
## ________________________________________________________________________________
## dense_1 (Dense)           (None, 1)         2        dot[0][0]                  
## ================================================================================
## Total params: 2,560,130
## Trainable params: 2,560,130
## Non-trainable params: 0
## ________________________________________________________________________________</code></pre>
</div>
<div id="model-training" class="section level4">
<h4><span class="header-section-number">3.4.1.4</span> Model training</h4>
<p>To fit the model we need to specify the number of training steps and the number of epochs. We will use only two epochs to keep the computation time reasonable.</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb38-1" title="1">model <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb38-2" title="2"><span class="st">  </span><span class="kw">fit_generator</span>(</a>
<a class="sourceLine" id="cb38-3" title="3">    <span class="kw">skipgrams_generator</span>(reviews, tokenizer, skip_window, negative_samples), </a>
<a class="sourceLine" id="cb38-4" title="4">    <span class="dt">steps_per_epoch =</span> <span class="dv">2000</span>, <span class="dt">epochs =</span> <span class="dv">2</span></a>
<a class="sourceLine" id="cb38-5" title="5">  )</a></code></pre></div>
<p>We can extract the embedding matrix from the model using the <code>get_weights()</code> function.</p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb39-1" title="1"><span class="kw">library</span>(dplyr)</a>
<a class="sourceLine" id="cb39-2" title="2"></a>
<a class="sourceLine" id="cb39-3" title="3">embedding_matrix &lt;-<span class="st"> </span><span class="kw">get_weights</span>(model)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb39-4" title="4"></a>
<a class="sourceLine" id="cb39-5" title="5">words &lt;-<span class="st"> </span><span class="kw">data_frame</span>(</a>
<a class="sourceLine" id="cb39-6" title="6">  <span class="dt">word =</span> <span class="kw">names</span>(tokenizer<span class="op">$</span>word_index), </a>
<a class="sourceLine" id="cb39-7" title="7">  <span class="dt">id =</span> <span class="kw">as.integer</span>(<span class="kw">unlist</span>(tokenizer<span class="op">$</span>word_index))</a>
<a class="sourceLine" id="cb39-8" title="8">)</a></code></pre></div>
<pre><code>## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.</code></pre>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb41-1" title="1">words &lt;-<span class="st"> </span>words <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb41-2" title="2"><span class="st">  </span><span class="kw">filter</span>(id <span class="op">&lt;=</span><span class="st"> </span>tokenizer<span class="op">$</span>num_words) <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb41-3" title="3"><span class="st">  </span><span class="kw">arrange</span>(id)</a>
<a class="sourceLine" id="cb41-4" title="4"></a>
<a class="sourceLine" id="cb41-5" title="5"><span class="kw">row.names</span>(embedding_matrix) &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;UNK&quot;</span>, words<span class="op">$</span>word)</a>
<a class="sourceLine" id="cb41-6" title="6"><span class="kw">dim</span>(embedding_matrix)</a></code></pre></div>
<pre><code>## [1] 20001   128</code></pre>
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb43-1" title="1"><span class="kw">head</span>(embedding_matrix)</a></code></pre></div>
<pre><code>##          [,1]         [,2]        [,3]         [,4]        [,5]        [,6]
## UNK 0.0170815 -0.001318716  0.04716846 0.0001694448 -0.01495628  0.01284777
## the 0.1174329 -0.383204609 -0.14486948 0.2978044450  0.17878745 -0.28364837
## i   0.1669428 -0.247277036 -0.21312974 0.3762883544  0.28889671 -0.30660424
## and 0.1605144 -0.230475992 -0.17357136 0.3322810829  0.20980199 -0.25617325
## a   0.1201052 -0.146714032 -0.15440495 0.3936571181  0.22841439 -0.16955599
## to  0.1918591 -0.273127079 -0.17826691 0.3529382646  0.28203753 -0.24766697
##             [,7]        [,8]        [,9]       [,10]         [,11]
## UNK -0.007465005 -0.01517171  0.03539601  0.04036165 -0.0001689792
## the  0.299164832 -0.30537584 -0.36950541 -0.28876448 -0.2681583762
## i    0.362938732 -0.26686862 -0.34998602 -0.23723570 -0.2789547443
## and  0.310600638 -0.21819803 -0.34236768 -0.32119495 -0.1779417843
## a    0.278418452 -0.25963432 -0.35211003 -0.31251714 -0.2506726086
## to   0.347074360 -0.16523284 -0.32140639 -0.28436336 -0.2654981911
##             [,12]       [,13]      [,14]       [,15]       [,16]       [,17]
## UNK -0.0004024878 -0.01031651 -0.0329108 -0.03570246 -0.01811307 -0.02383759
## the  0.2796629667  0.31438842 -0.2472963  0.20960861 -0.28063810  0.17194803
## i    0.3474592566  0.33163065 -0.3115005  0.29770941 -0.32637006  0.09885775
## and  0.3099879324  0.28850329 -0.2122750  0.25450185 -0.35340777  0.15842982
## a    0.3022259772  0.33895627 -0.2288224  0.27653986 -0.23382780  0.15667632
## to   0.3335136473  0.27410549 -0.3449964  0.31313083 -0.33614126  0.02599920
##           [,18]       [,19]       [,20]       [,21]       [,22]       [,23]
## UNK 0.001930606 -0.04853332  0.03184437 -0.03507299  0.03099043 -0.01559343
## the 0.249449074  0.38885596 -0.10571286  0.03303542 -0.01977662  0.12713103
## i   0.266584009  0.32550138 -0.14738728  0.10654508  0.04477475  0.28375289
## and 0.281591505  0.33543691 -0.12878042  0.19387311  0.08222436  0.23661953
## a   0.213139340  0.28658631 -0.13082723  0.11910986  0.03216605  0.19973753
## to  0.144440114  0.35665330 -0.12372198  0.12668315  0.19030270  0.19488603
##           [,24]       [,25]       [,26]       [,27]       [,28]       [,29]
## UNK -0.02061616  0.01281368 -0.03231498 0.007334851  0.03719245  0.03511449
## the -0.17688274 -0.32929045 -0.29310527 0.289794803  0.13527869 -0.27275386
## i   -0.23470533 -0.32637039 -0.26748180 0.318160236  0.03208359 -0.21226105
## and -0.25599492 -0.34276465 -0.33586788 0.217786700  0.02351790 -0.27456364
## a   -0.17987984 -0.37211826 -0.33599785 0.294163078 -0.06172014 -0.27303630
## to  -0.25984651 -0.32187936 -0.29270798 0.297974795 -0.07057865 -0.25982314
##           [,30]         [,31]       [,32]        [,33]       [,34]       [,35]
## UNK -0.01691567  0.0008091219 -0.03200240 -0.006604362 -0.03469641 -0.03299238
## the  0.25344208  0.1719853282  0.07652652 -0.321915329  0.23948634  0.01470774
## i    0.22992177  0.0279782731  0.08346167 -0.304561466  0.12635729  0.02191944
## and  0.18178248  0.1102448180  0.08257330 -0.269136578  0.20793088 -0.09406497
## a    0.18358734  0.2180162519  0.05090325 -0.267941982  0.16697714  0.03600145
## to   0.25895324 -0.0176757276  0.13935305 -0.208568946  0.19998248 -0.05754858
##           [,36]      [,37]       [,38]       [,39]      [,40]        [,41]
## UNK  0.03315267 0.04222684 -0.02326957 -0.03361871 -0.0365397 -0.043440260
## the -0.28853074 0.37395093  0.20946701  0.15802002  0.2907013 -0.010865721
## i   -0.36051789 0.36837849  0.31315809  0.19938451  0.2936096 -0.131256387
## and -0.27480209 0.37666577  0.28615251  0.18097939  0.2794661 -0.006140751
## a   -0.27293894 0.33773720  0.32223794  0.22417280  0.2404891 -0.039677981
## to  -0.30174917 0.30440938  0.29927579  0.14390787  0.2480671 -0.093178861
##          [,42]       [,43]     [,44]       [,45]      [,46]       [,47]
## UNK 0.01521539  0.00477301 0.0318060 -0.02853984  0.0437434 -0.01864365
## the 0.30115995  0.05320331 0.3138930  0.10096217 -0.2055138 -0.20989250
## i   0.35655829 -0.11047912 0.3219035  0.07911555 -0.2606398 -0.12337824
## and 0.33989990  0.04763594 0.2755820  0.04128102 -0.1459931 -0.06884515
## a   0.35712439  0.04051579 0.2893077  0.08757305 -0.1562458 -0.03052964
## to  0.36761856 -0.03275176 0.3164682  0.02745412 -0.2089933 -0.10701507
##           [,48]       [,49]       [,50]       [,51]       [,52]       [,53]
## UNK  0.04101357  0.03473446  0.04457737 -0.00114752 -0.04412064 -0.03156789
## the  0.02520799 -0.25871646 -0.37619755  0.38879156 -0.25250259 -0.39248210
## i    0.01821698 -0.22392237 -0.32261470  0.35049382 -0.27527589 -0.38099816
## and -0.05184063 -0.16932927 -0.33808553  0.37406263 -0.32858366 -0.34304696
## a   -0.09030625 -0.29941657 -0.30359963  0.29772824 -0.26200148 -0.27937773
## to  -0.01727733 -0.24934988 -0.25524116  0.35741013 -0.29380852 -0.32908934
##            [,54]       [,55]       [,56]       [,57]      [,58]       [,59]
## UNK  0.006125987 -0.02187279  0.03910586 -0.01629753 0.04972812  0.03518805
## the -0.310638994  0.36762497 -0.10685650  0.34639886 0.35899246 -0.36781129
## i   -0.349121183  0.37237552 -0.21593590  0.31275219 0.42300200 -0.30578846
## and -0.344224006  0.30466965 -0.15587379  0.32809687 0.32287005 -0.33381802
## a   -0.350583643  0.32805666 -0.11700507  0.27578056 0.28753132 -0.29372326
## to  -0.369859546  0.37795553 -0.16025186  0.28375304 0.33897606 -0.31854972
##          [,60]       [,61]      [,62]        [,63]       [,64]        [,65]
## UNK 0.04667592 -0.01325144 0.04597353  0.007199753  0.04204318  0.004700471
## the 0.27253532  0.14934577 0.30485842 -0.316160858 -0.27030918 -0.083663106
## i   0.23735967  0.11750556 0.31638011 -0.297600389 -0.21566036 -0.096772537
## and 0.18881306  0.06630570 0.31316441 -0.284757823 -0.26432148 -0.110183306
## a   0.23069674  0.08311401 0.23403980 -0.271946669 -0.19995712 -0.172960505
## to  0.29635924  0.04342796 0.25315505 -0.227531537 -0.28717574 -0.092312992
##           [,66]       [,67]       [,68]        [,69]       [,70]      [,71]
## UNK -0.02999171 -0.01793531  0.04247624  0.002061225 -0.02451816 0.02576012
## the -0.11517787  0.12338851 -0.04987039 -0.034545448 -0.06101116 0.05578730
## i    0.06015281  0.01339131  0.00329218 -0.055252384 -0.08577745 0.12629287
## and -0.07259820 -0.03888071  0.01285036 -0.006432456 -0.02521065 0.09755906
## a    0.01760340 -0.11909988  0.08444444 -0.040843900  0.01344274 0.11524677
## to   0.04376663 -0.09150210  0.03593209 -0.064415947 -0.08948220 0.10198851
##            [,72]       [,73]        [,74]       [,75]       [,76]        [,77]
## UNK  0.008057524 -0.01873593  0.004101884 -0.02451124 -0.01045469 -0.007409252
## the -0.210249856  0.24684344 -0.178273425  0.12195436 -0.14832009 -0.229609445
## i   -0.339593619  0.25152388 -0.016085856  0.11553814 -0.24507025 -0.292536318
## and -0.210022196  0.24458480 -0.108235233  0.12692633 -0.15382548 -0.244550496
## a   -0.300737977  0.19017297 -0.177162036  0.09492953 -0.15454216 -0.303135395
## to  -0.386955142  0.19876392 -0.071368732  0.08350065 -0.25868317 -0.269913286
##           [,78]      [,79]       [,80]       [,81]       [,82]       [,83]
## UNK  0.02775276 0.02282996  0.00179093 0.001864087 -0.02228262  0.02817461
## the -0.35284138 0.27658230 -0.26196435 0.198721379 -0.04214166 -0.37608743
## i   -0.37237868 0.28755802 -0.29970258 0.303372979 -0.02192708 -0.34754741
## and -0.32777765 0.19116035 -0.25336358 0.194186181  0.01598708 -0.33635759
## a   -0.30274969 0.23791181 -0.28293476 0.306074739 -0.07584713 -0.36234859
## to  -0.32587340 0.23608799 -0.21567842 0.323878646 -0.03771929 -0.36036131
##            [,84]       [,85]       [,86]        [,87]      [,88]       [,89]
## UNK -0.003465034  0.01472980  0.04781802 -0.007102478 0.04773111 -0.01337481
## the  0.288525522 -0.08883095 -0.22900291 -0.169002935 0.16737778 -0.19447237
## i    0.313215643 -0.19014385 -0.28267121 -0.305975825 0.15154187 -0.13293040
## and  0.332025588 -0.14811261 -0.27446076 -0.231908619 0.10666943 -0.24451743
## a    0.283321589 -0.14821632 -0.25289920 -0.340203524 0.18615662 -0.28705639
## to   0.297496915 -0.17047712 -0.22169787 -0.278337449 0.18933631 -0.13048588
##           [,90]       [,91]       [,92]         [,93]      [,94]       [,95]
## UNK -0.01147924 -0.00587051  0.04609055 -0.0001483187 0.04964909 -0.01832782
## the -0.32560977 -0.17560473 -0.25437045  0.1731803566 0.28145239  0.37646624
## i   -0.37563258 -0.22397560 -0.19170810  0.2082554251 0.29436311  0.39287877
## and -0.30389935 -0.06919212 -0.20327331  0.2024096847 0.29195097  0.38377520
## a   -0.28205562 -0.17666516 -0.23301259  0.2374358177 0.33052328  0.34986836
## to  -0.30297431 -0.15620883 -0.13064004  0.2584783733 0.28516826  0.32652554
##           [,96]       [,97]       [,98]       [,99]      [,100]      [,101]
## UNK 0.007632814 0.005672503 -0.03266094 -0.03422055  0.01324456  0.02385963
## the 0.302464426 0.110885777 -0.25989842  0.38501096 -0.21835038 -0.20524764
## i   0.296282232 0.233367577 -0.20001604  0.38733375 -0.24814697 -0.15178022
## and 0.310684830 0.118035696 -0.17034222  0.33256108 -0.19430402 -0.18163270
## a   0.337718070 0.206213146 -0.24668492  0.32690060 -0.22618128 -0.18086091
## to  0.365883678 0.275636226 -0.25219843  0.31734639 -0.24367192 -0.09583790
##          [,102]      [,103]       [,104]      [,105]      [,106]     [,107]
## UNK -0.04407374 -0.03826056  0.001508869 -0.01589622 -0.02959334 0.02968781
## the  0.19846255  0.21909449 -0.077848770  0.24505678  0.34444517 0.16305423
## i    0.28018263  0.23005924 -0.110724866  0.19277629  0.37343127 0.09825584
## and  0.24288949  0.20457232 -0.084622771  0.14417087  0.26135206 0.06721097
## a    0.29050061  0.30875629 -0.190523997  0.18856290  0.30580518 0.14732127
## to   0.26581255  0.28777468 -0.149579942  0.14878766  0.26804748 0.02433010
##          [,108]      [,109]     [,110]      [,111]      [,112]     [,113]
## UNK 0.032761965 -0.03484957 0.04724887 -0.01673875  0.04846228 0.01651904
## the 0.072625257 -0.22000135 0.29008779  0.21206540 -0.08905456 0.30720186
## i   0.137971386 -0.30346900 0.32460919  0.24432959 -0.04133808 0.37891588
## and 0.067033872 -0.28801164 0.25215000  0.20263389 -0.07493820 0.23705998
## a   0.008180861 -0.27213508 0.28106183  0.26201847 -0.13696785 0.24244435
## to  0.133614138 -0.27308589 0.25494623  0.20437345  0.09366596 0.26203859
##         [,114]      [,115]      [,116]      [,117]      [,118]      [,119]
## UNK 0.03307271 0.006484438  0.04113635 -0.04053799  0.03232178 -0.01916174
## the 0.37055835 0.000467650 -0.33548412 -0.06496470 -0.36024690 -0.24840827
## i   0.32405031 0.068578012 -0.42496726 -0.02733710 -0.32280901 -0.25782296
## and 0.37622270 0.064659454 -0.35911053  0.03896505 -0.33915231 -0.27443057
## a   0.37359667 0.117730454 -0.29758999  0.10123279 -0.32194731 -0.21069331
## to  0.32364631 0.143883005 -0.31454709  0.14705095 -0.27599317 -0.22744414
##          [,120]      [,121]      [,122]     [,123]      [,124]       [,125]
## UNK -0.00625832 -0.04583708  0.01545075 0.03551065 -0.02845714 -0.024710560
## the  0.23234977  0.37063292 -0.06700579 0.18953991 -0.30410570 -0.069787078
## i    0.36668271  0.31222266 -0.09937420 0.27160498 -0.34567764 -0.001867783
## and  0.33828005  0.33334142 -0.06587321 0.30039573 -0.28826779  0.067308143
## a    0.30670270  0.39588228 -0.05656852 0.15388277 -0.30768496  0.107387684
## to   0.30960637  0.33873239 -0.03573503 0.23836206 -0.29423913  0.053528536
##         [,126]     [,127]      [,128]
## UNK 0.03197979 0.02286074 -0.04385829
## the 0.36302400 0.25149447  0.23504810
## i   0.39596969 0.26540828  0.26525143
## and 0.33469549 0.24421459  0.21839896
## a   0.31807998 0.26717538  0.26289040
## to  0.36336777 0.25848782  0.25542757</code></pre>
</div>
<div id="understanding-the-embeddings" class="section level4">
<h4><span class="header-section-number">3.4.1.5</span> Understanding the embeddings</h4>
<p>We can now find words that are close to each other in the embedding. We will use the cosine similarity, since it is the normalized form of the dot product that the model was trained to make large for words appearing in the same context.</p>
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb45-1" title="1"><span class="kw">library</span>(text2vec)</a></code></pre></div>
<pre><code>## 
## Attaching package: &#39;text2vec&#39;</code></pre>
<pre><code>## The following objects are masked from &#39;package:keras&#39;:
## 
##     fit, normalize</code></pre>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb48-1" title="1">find_similar_words &lt;-<span class="st"> </span><span class="cf">function</span>(word, embedding_matrix, <span class="dt">n =</span> <span class="dv">5</span>) {</a>
<a class="sourceLine" id="cb48-2" title="2">  similarities &lt;-<span class="st"> </span>embedding_matrix[word, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb48-3" title="3"><span class="st">    </span><span class="kw">sim2</span>(embedding_matrix, <span class="dt">y =</span> ., <span class="dt">method =</span> <span class="st">&quot;cosine&quot;</span>)</a>
<a class="sourceLine" id="cb48-4" title="4">  </a>
<a class="sourceLine" id="cb48-5" title="5">  similarities[,<span class="dv">1</span>] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sort</span>(<span class="dt">decreasing =</span> <span class="ot">TRUE</span>) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">head</span>(n)</a>
<a class="sourceLine" id="cb48-6" title="6">}</a>
<a class="sourceLine" id="cb48-7" title="7"></a>
<a class="sourceLine" id="cb48-8" title="8"></a>
<a class="sourceLine" id="cb48-9" title="9"><span class="kw">find_similar_words</span>(<span class="st">&quot;delicious&quot;</span>, embedding_matrix)</a></code></pre></div>
<pre><code>## delicious    bought     green   texture     price 
## 1.0000000 0.9809152 0.9789813 0.9783692 0.9781281</code></pre>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb50-1" title="1"><span class="kw">find_similar_words</span>(<span class="st">&quot;cats&quot;</span>, embedding_matrix)</a></code></pre></div>
<pre><code>##      cats chocolate      best       too       bag 
## 1.0000000 0.9785330 0.9782802 0.9773057 0.9770379</code></pre>
<p>The t-SNE algorithm can be used to visualize the embeddings. Because of time constraints we will only use it with the first 500 words. To understand more about the t-SNE method see the article: <a href="https://distill.pub/2016/misread-tsne/" class="uri">https://distill.pub/2016/misread-tsne/</a></p>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb52-1" title="1"><span class="kw">library</span>(Rtsne)</a>
<a class="sourceLine" id="cb52-2" title="2"><span class="kw">library</span>(ggplot2)</a>
<a class="sourceLine" id="cb52-3" title="3"><span class="kw">library</span>(plotly)</a></code></pre></div>
<pre><code>## 
## Attaching package: &#39;plotly&#39;</code></pre>
<pre><code>## The following object is masked from &#39;package:ggplot2&#39;:
## 
##     last_plot</code></pre>
<pre><code>## The following object is masked from &#39;package:stats&#39;:
## 
##     filter</code></pre>
<pre><code>## The following object is masked from &#39;package:graphics&#39;:
## 
##     layout</code></pre>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb57-1" title="1">tsne &lt;-<span class="st"> </span><span class="kw">Rtsne</span>(embedding_matrix[<span class="dv">2</span><span class="op">:</span><span class="dv">500</span>,], <span class="dt">perplexity =</span> <span class="dv">50</span>, <span class="dt">pca =</span> <span class="ot">FALSE</span>)</a>
<a class="sourceLine" id="cb57-2" title="2"></a>
<a class="sourceLine" id="cb57-3" title="3">tsne_plot &lt;-<span class="st"> </span>tsne<span class="op">$</span>Y <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb57-4" title="4"><span class="st">  </span><span class="kw">as.data.frame</span>() <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb57-5" title="5"><span class="st">  </span><span class="kw">mutate</span>(<span class="dt">word =</span> <span class="kw">row.names</span>(embedding_matrix)[<span class="dv">2</span><span class="op">:</span><span class="dv">500</span>]) <span class="op">%&gt;%</span></a>
<a class="sourceLine" id="cb57-6" title="6"><span class="st">  </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> V1, <span class="dt">y =</span> V2, <span class="dt">label =</span> word)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb57-7" title="7"><span class="st">  </span><span class="kw">geom_text</span>(<span class="dt">size =</span> <span class="dv">3</span>)</a>
<a class="sourceLine" id="cb57-8" title="8">tsne_plot</a></code></pre></div>
<p><img src="NLP-book_files/figure-html/Rtsne%20-1.png" alt="t-SNE projection of the word embeddings, with each point drawn as its word label" width="672" /></p>
</div>
</div>
<div id="using-glove" class="section level3">
<h3><span class="header-section-number">3.4.2</span> Using GloVe</h3>
<p>source: <a href="http://text2vec.org/glove.html" class="uri">http://text2vec.org/glove.html</a></p>
<p>In this example, we will use GloVe to test how well it captures linguistic regularities. By taking the word vectors corresponding to the words “paris”, “france”, and “germany”, we expect to obtain “berlin” as the closest vector to the result of:
<span class="math inline">\(vector(&quot;paris&quot;) - vector(&quot;france&quot;) + vector(&quot;germany&quot;)\)</span></p>
<p>We will use the Wikipedia data which is used as a demo by word2vec.</p>
<div class="sourceCode" id="cb58"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb58-1" title="1"><span class="co"># download data</span></a>
<a class="sourceLine" id="cb58-2" title="2"><span class="co"># download.file(&quot;http://mattmahoney.net/dc/text8.zip&quot;, &quot;D:/NLP/NLP-book/data/text8.zip&quot;)</span></a>
<a class="sourceLine" id="cb58-3" title="3"><span class="co"># unzip(&quot;D:/NLP/NLP-book/data/text8.zip&quot;, files = &quot;text8&quot;, exdir = &quot;D:/NLP/NLP-book/data/text8&quot;)</span></a>
<a class="sourceLine" id="cb58-4" title="4"><span class="co"># load data</span></a>
<a class="sourceLine" id="cb58-5" title="5">wiki =<span class="st"> </span><span class="kw">readLines</span>(<span class="st">&quot;D:/NLP/NLP-book/data/text8/text8&quot;</span>, <span class="dt">n =</span> <span class="dv">1</span>, <span class="dt">warn =</span> <span class="ot">FALSE</span>)</a></code></pre></div>
<p>Now, we create a vocabulary, i.e. the set of words for which we want to learn word vectors.</p>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb59-1" title="1"><span class="co"># Create iterator over tokens</span></a>
<a class="sourceLine" id="cb59-2" title="2">tokens &lt;-<span class="st"> </span><span class="kw">space_tokenizer</span>(wiki)</a>
<a class="sourceLine" id="cb59-3" title="3"><span class="co"># Create vocabulary. Terms will be unigrams (simple words).</span></a>
<a class="sourceLine" id="cb59-4" title="4">it =<span class="st"> </span><span class="kw">itoken</span>(tokens, <span class="dt">progressbar =</span> <span class="ot">FALSE</span>)</a>
<a class="sourceLine" id="cb59-5" title="5">vocab &lt;-<span class="st"> </span><span class="kw">create_vocabulary</span>(it)</a>
<a class="sourceLine" id="cb59-6" title="6"><span class="kw">str</span>(vocab)</a></code></pre></div>
<pre><code>## Classes &#39;text2vec_vocabulary&#39; and &#39;data.frame&#39;:  253854 obs. of  3 variables:
##  $ term      : chr  &quot;aaaaaacceglllnorst&quot; &quot;aaaaaaccegllnorrst&quot; &quot;aaaaaah&quot; &quot;aaaaaalmrsstt&quot; ...
##  $ term_count: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ doc_count : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, &quot;ngram&quot;)= Named int  1 1
##   ..- attr(*, &quot;names&quot;)= chr  &quot;ngram_min&quot; &quot;ngram_max&quot;
##  - attr(*, &quot;document_count&quot;)= int 1
##  - attr(*, &quot;stopwords&quot;)= chr 
##  - attr(*, &quot;sep_ngram&quot;)= chr &quot;_&quot;</code></pre>
<p>We should remove uncommon words, since it is not meaningful to keep a word vector for a word that we saw only once in the entire corpus. In this example we will keep only words which appear at least five times.</p>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb61-1" title="1">vocab &lt;-<span class="st"> </span><span class="kw">prune_vocabulary</span>(vocab, <span class="dt">term_count_min =</span> 5L)</a>
<a class="sourceLine" id="cb61-2" title="2"><span class="kw">min</span>(vocab<span class="op">$</span>term_count)</a></code></pre></div>
<pre><code>## [1] 5</code></pre>
<p>Now we have 71,290 terms in the vocabulary and are ready to construct the term-co-occurrence matrix (TCM).</p>
<div class="sourceCode" id="cb63"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb63-1" title="1"><span class="co"># Use our filtered vocabulary</span></a>
<a class="sourceLine" id="cb63-2" title="2">vectorizer &lt;-<span class="st"> </span><span class="kw">vocab_vectorizer</span>(vocab)</a>
<a class="sourceLine" id="cb63-3" title="3"><span class="co"># use window of 5 for context words</span></a>
<a class="sourceLine" id="cb63-4" title="4">tcm &lt;-<span class="st"> </span><span class="kw">create_tcm</span>(it, vectorizer, <span class="dt">skip_grams_window =</span> 5L)</a>
<a class="sourceLine" id="cb63-5" title="5">tcm[<span class="dv">1</span><span class="op">:</span><span class="dv">10</span>, <span class="dv">1</span><span class="op">:</span><span class="dv">10</span>]</a></code></pre></div>
<pre><code>## 10 x 10 sparse Matrix of class &quot;dgTMatrix&quot;</code></pre>
<pre><code>##    [[ suppressing 10 column names &#39;aapke&#39;, &#39;ababda&#39;, &#39;abakumov&#39; ... ]]</code></pre>
<pre><code>##                                   
## aapke       . . . . . . .    . . .
## ababda      . . . . . . .    . . .
## abakumov    . . . . . . .    . . .
## abalones    . . . . . . .    . . .
## abano       . . . . . . .    . . .
## abati       . . . . . . .    . . .
## abbates     . . . . . . 1.25 . . .
## abbesses    . . . . . . .    . . .
## abderus     . . . . . . .    . 1 .
## abdications . . . . . . .    . . .</code></pre>
<p>Now we have a TCM matrix and can factorize it via the GloVe algorithm.</p>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb67-1" title="1">glove =<span class="st"> </span>GlobalVectors<span class="op">$</span><span class="kw">new</span>(<span class="dt">rank =</span> <span class="dv">50</span>, <span class="dt">x_max =</span> <span class="dv">10</span>)</a>
<a class="sourceLine" id="cb67-2" title="2">wv_main =<span class="st"> </span>glove<span class="op">$</span><span class="kw">fit_transform</span>(tcm, <span class="dt">n_iter =</span> <span class="dv">10</span>, <span class="dt">convergence_tol =</span> <span class="fl">0.01</span>)</a></code></pre></div>
<pre><code>## INFO  [23:50:33.466] epoch 1, loss 0.1745 
## INFO  [23:50:47.080] epoch 2, loss 0.1224 
## INFO  [23:51:00.388] epoch 3, loss 0.1083 
## INFO  [23:51:13.728] epoch 4, loss 0.1004 
## INFO  [23:51:27.567] epoch 5, loss 0.0953 
## INFO  [23:51:40.924] epoch 6, loss 0.0917 
## INFO  [23:51:54.514] epoch 7, loss 0.0889 
## INFO  [23:52:08.667] epoch 8, loss 0.0868 
## INFO  [23:52:22.352] epoch 9, loss 0.0850 
## INFO  [23:52:35.699] epoch 10, loss 0.0836</code></pre>
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb69-1" title="1"><span class="kw">dim</span>(wv_main)</a></code></pre></div>
<pre><code>## [1] 71290    50</code></pre>
<p>Note that the model learns two sets of word vectors: main and context. Essentially they are the same, since the model is symmetric. In our experience, learning two sets of word vectors leads to higher-quality embeddings.</p>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb71-1" title="1">wv_context =<span class="st"> </span>glove<span class="op">$</span>components</a>
<a class="sourceLine" id="cb71-2" title="2"><span class="kw">dim</span>(wv_context)</a></code></pre></div>
<pre><code>## [1]    50 71290</code></pre>
<p>While either of the word-vector matrices can be used as the result, it is usually better (an idea from the GloVe paper) to average or sum the main and context vectors:</p>
<div class="sourceCode" id="cb73"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb73-1" title="1">word_vectors =<span class="st"> </span>wv_main <span class="op">+</span><span class="st"> </span><span class="kw">t</span>(wv_context)</a></code></pre></div>
<p>We can find the closest word vectors for our paris - france + germany example:</p>
<div class="sourceCode" id="cb74"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb74-1" title="1">berlin =<span class="st"> </span>word_vectors[<span class="st">&quot;paris&quot;</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">-</span><span class="st"> </span></a>
<a class="sourceLine" id="cb74-2" title="2"><span class="st">  </span>word_vectors[<span class="st">&quot;france&quot;</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>] <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb74-3" title="3"><span class="st">  </span>word_vectors[<span class="st">&quot;germany&quot;</span>, , drop =<span class="st"> </span><span class="ot">FALSE</span>]</a>
<a class="sourceLine" id="cb74-4" title="4">cos_sim =<span class="st"> </span><span class="kw">sim2</span>(<span class="dt">x =</span> word_vectors, <span class="dt">y =</span> berlin, <span class="dt">method =</span> <span class="st">&quot;cosine&quot;</span>, <span class="dt">norm =</span> <span class="st">&quot;l2&quot;</span>)</a>
<a class="sourceLine" id="cb74-5" title="5"><span class="kw">head</span>(<span class="kw">sort</span>(cos_sim[,<span class="dv">1</span>], <span class="dt">decreasing =</span> <span class="ot">TRUE</span>), <span class="dv">5</span>)</a></code></pre></div>
<pre><code>##     paris    berlin      bonn    london   leipzig 
## 0.7771973 0.7295444 0.6742783 0.6663386 0.6612857</code></pre>
</div>
</div>
<div id="references" class="section level2">
<h2><span class="header-section-number">3.5</span> references</h2>
<ul>
<li><a href="http://pablobarbera.com/ECPR-SC105/code/16-word-embeddings.html" class="uri">http://pablobarbera.com/ECPR-SC105/code/16-word-embeddings.html</a></li>
<li><a href="https://code.google.com/archive/p/word2vec/" class="uri">https://code.google.com/archive/p/word2vec/</a></li>
<li><a href="https://m-clark.github.io/text-analysis-with-R/word-embeddings.html#wikipedia" class="uri">https://m-clark.github.io/text-analysis-with-R/word-embeddings.html#wikipedia</a></li>
<li><a href="https://juliasilge.com/blog/gender-pronouns/" class="uri">https://juliasilge.com/blog/gender-pronouns/</a></li>
<li><a href="https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/" class="uri">https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/</a></li>
<li><a href="https://machinelearningmastery.com/what-are-word-embeddings/" class="uri">https://machinelearningmastery.com/what-are-word-embeddings/</a></li>
<li><a href="https://rpubs.com/JanpuHou/396443" class="uri">https://rpubs.com/JanpuHou/396443</a></li>
<li><a href="https://mran.microsoft.com/snapshot/2016-03-05/web/packages/text2vec/vignettes/text-vectorization.html" class="uri">https://mran.microsoft.com/snapshot/2016-03-05/web/packages/text2vec/vignettes/text-vectorization.html</a></li>
<li><a href="https://cbail.github.io/textasdata/word2vec/rmarkdown/word2vec.html" class="uri">https://cbail.github.io/textasdata/word2vec/rmarkdown/word2vec.html</a></li>
<li><a href="https://www.jla-data.net/eng/vocabulary-based-text-classification/" class="uri">https://www.jla-data.net/eng/vocabulary-based-text-classification/</a></li>
<li><a href="http://text2vec.org/glove.html" class="uri">http://text2vec.org/glove.html</a></li>
<li><a href="http://text2vec.org/similarity.html" class="uri">http://text2vec.org/similarity.html</a></li>
<li><a href="https://www.r-craft.org/r-news/get-busy-with-word-embeddings-an-introduction/" class="uri">https://www.r-craft.org/r-news/get-busy-with-word-embeddings-an-introduction/</a></li>
</ul>

</div>
</div>
            </section>

          </div>
        </div>
      </div>
<a href="text-processing.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="text-classification.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
    </div>
  </div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
gitbook.require(["gitbook"], function (book) {
  // Options passed to the GitBook runtime's start() call for this page.
  var options = {
    // Sharing settings: only facebook and twitter are switched on here.
    sharing: {
      github: false,
      facebook: true,
      twitter: true,
      linkedin: false,
      weibo: false,
      instapaper: false,
      vk: false,
      all: ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
    },
    // Font settings: white theme, sans family, size 2.
    fontsettings: {
      theme: "white",
      family: "sans",
      size: 2
    },
    // No edit / history / view links are configured (all null).
    edit: {
      link: null,
      text: null
    },
    history: {
      link: null,
      text: null
    },
    view: {
      link: null,
      text: null
    },
    // Files listed for download.
    download: ["NLP-book.pdf", "NLP-book.epub"],
    toc: {
      collapse: "subsection"
    }
  };

  book.start(options);
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Injects the MathJax <script> element into <head> at runtime.
    //
    // The previous version stripped the "https:" scheme whenever the page was
    // not opened from file:, producing a protocol-relative URL. On a page
    // served over plain http that fetched this third-party script over http
    // as well. The URL is now used as written, so the default build is always
    // requested over HTTPS regardless of how the page itself is served.
    var DEFAULT_SRC = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";

    // NOTE(review): "true" looks like a value filled in by the page generator
    // meaning "use the default MathJax build" -- confirm against the template.
    var src = "true";
    if (src === "" || src === "true") {
      src = DEFAULT_SRC;
    }

    // No explicit type is set: classic scripts default to JavaScript.
    var script = document.createElement("script");
    script.src = src;
    document.head.appendChild(script);
  })();
</script>
</body>

</html>