[sentiment] Add tfjs-node training script; Fix padding issue (tensorf…

…low#209) - train.js contains the training logic - train.js uses data.js to load and format the training data - sequence_utils.js is created to house the truncation/padding logic for both the tfjs-node training and in-browser inference - README.md is updated
volzb · Jan 19, 2019 · 6fa70fa · 6fa70fa
1 parent 274758e
commit 6fa70fa
Show file tree

Hide file tree

Showing 11 changed files with 1,066 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -188,12 +188,12 @@ to another project.
     <td><a href="./sentiment">sentiment</a></td>
     <td><a href="https://storage.googleapis.com/tfjs-examples/sentiment/dist/index.html">🔗</a></td>
     <td>Text</td>
-    <td>Sequence-to-regression</td>
+    <td>Sequence-to-binary-prediction</td>
     <td>LSTM, 1D convnet</td>
-    <td></td>
+    <td>Node.js</td>
     <td>Browser</td>
     <td>Layers</td>
-    <td>Loading model converted from Keras</td>
+    <td>Loading model converted from Keras and tfjs-node</td>
   </tr>
   <tr>
     <td><a href="./simple-object-detection">simple-object-detection</a></td>

diff --git a/sentiment/.gitignore b/sentiment/.gitignore
@@ -0,0 +1,4 @@
+*.bin
+*.zip
+model.json
+metadata.json
diff --git a/sentiment/README.md b/sentiment/README.md
@@ -24,3 +24,46 @@ yarn watch
 ```
 
 [See this example live!](https://storage.googleapis.com/tfjs-examples/sentiment/dist/index.html)
+
+## Training your own model in tfjs-node
+
+To train the model using tfjs-node, do
+
+```sh
+yarn
+yarn train <MODEL_TYPE>
+```
+
+where `MODEL_TYPE` is a required argument that specifies what type of model is to be
+trained. The available options are:
+
+- `flatten`: A model that flattens the embedding vectors of all words in the sequence.
+- `cnn`: A 1D convolutional model.
+- `simpleRNN`: A model that uses a SimpleRNN layer (`tf.layers.simpleRNN`)
+- `lstm`: A model that uses an LSTM layer (`tf.layers.lstm`)
+- `bidirectionalLSTM`: A model that uses a bidirectional LSTM layer
+  (`tf.layers.bidirectional` and `tf.layers.lstm`)
+
+By default, the training happens on the CPU using the Eigen kernels from tfjs-node.
+You can make the training happen on GPU by adding the `--gpu` flag to the command, e.g.,
+
+```sh
+yarn train --gpu <MODEL_TYPE>
+```
+
+The training process will download the training data and metadata from the web
+if they haven't been downloaded before. After the model training completes, the model
+will be saved to the `dist/resources` folder, alongside a `metadata.json` file.
+Then when you run `yarn watch`, you will see a "Load local model" button in the web
+page, which allows you to use the locally-trained model for inference in the browser.
+
+Other arguments of the `yarn train` command include:
+
+- `--maxLen` allows you to specify the sequence length.
+- `--numWords` allows you to specify the vocabulary size.
+- `--embeddingSize` allows you to adjust the dimensionality of the embedding vectors.
+- `--epochs`, `--batchSize`, and `--validationSplit` are training-related settings.
+- `--modelSavePath` allows you to specify where to store the model and metadata after
+  training completes.
+
+The detailed training code is in the file [train.js](./train.js).
diff --git a/sentiment/data.js b/sentiment/data.js
@@ -0,0 +1,216 @@
+/**
+ * @license
+ * Copyright 2019 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import * as tf from '@tensorflow/tfjs';
+import * as fs from 'fs';
+import * as https from 'https';
+import * as os from 'os';
+import * as path from 'path';
+
+import {OOV_CHAR, PAD_CHAR, padSequences} from './sequence_utils';
+
+// Loaded via `require`: `import` doesn't seem to work with extract-zip.
+const extract = require('extract-zip');
+
+const DATA_ZIP_URL =  // Zip archive that is downloaded to get the IMDB data.
+    'https://storage.googleapis.com/learnjs-data/imdb/imdb_tfjs_data.zip';
+const METADATA_TEMPLATE_URL =  // Zipped JSON metadata template.
+    'https://storage.googleapis.com/learnjs-data/imdb/metadata.json.zip';
+
+/**
+ * Load IMDB data features from a local file.
+ *
+ * @param {string} filePath Data file on local filesystem.
+ * @param {string} numWords Number of words in the vocabulary. Word indices
+ *   that exceed this limit will be marked as `OOV_CHAR`.
+ * @param {string} maxLen Length of each sequence. Longer sequences will be
+ *   pre-truncated; shorter ones will be pre-padded.
+ * @return {tf.Tensor} The dataset represented as a 2D `tf.Tensor` of shape
+ *   `[]` and dtype `int32` .
+ */
+function loadFeatures(filePath, numWords, maxLen) {
+  const buffer = fs.readFileSync(filePath);
+  const numBytes = buffer.byteLength;
+
+  let sequences = [];
+  let seq = [];
+  let index = 0;
+
+  while (index < numBytes) {
+    const value = buffer.readInt32LE(index);
+    if (value === 1) {
+      // A new sequence has started.
+      if (index > 0) {
+        sequences.push(seq);
+      }
+      seq = [];
+    } else {
+      // Sequence continues.
+      seq.push(value >= numWords ? OOV_CHAR : value);
+    }
+    index += 4;
+  }
+  if (seq.length > 0) {
+    sequences.push(seq);
+  }
+  const paddedSequences =
+      padSequences(sequences, maxLen, 'pre', 'pre');
+  return tf.tensor2d(
+      paddedSequences, [paddedSequences.length, maxLen], 'int32');
+}
+
+/**
+ * Load IMDB targets from a file.
+ *
+ * @param {string} filePath Path to the binary targets file.
+ * @return {tf.Tensor} The targets as `tf.Tensor` of shape `[numExamples, 1]`
+ *   and dtype `float32`. It has 0 or 1 values.
+ */
+function loadTargets(filePath) {
+  const buffer = fs.readFileSync(filePath);
+  const numBytes = buffer.byteLength;
+
+  let ys = [];
+  for (let i = 0; i < numBytes; ++i) {
+    ys.push(buffer.readUInt8(i));
+  }
+  return tf.tensor2d(ys, [ys.length, 1], 'float32');
+}
+
+/**
+ * Get a file by downloading it if necessary.
+ *
+ * @param {string} sourceURL URL to download the file from.
+ * @param {string} destPath Destination file path on local filesystem.
+ */
+async function maybeDownload(sourceURL, destPath) {
+  return new Promise(async (resolve, reject) => {
+    if (!fs.existsSync(destPath) || fs.lstatSync(destPath).size === 0) {
+      const localZipFile = fs.createWriteStream(destPath);
+      console.log(`Downloading file from ${sourceURL} ...`);
+      https.get(sourceURL, response => {
+        response.pipe(localZipFile);
+        localZipFile.on('finish', () => {
+          localZipFile.close(async () => {
+            return resolve();
+          });
+        });
+        localZipFile.on('error', err => {
+          return reject(err);
+        });
+      });
+    } else {
+      return resolve();
+    }
+  });
+}
+
+/**
+ * Get extracted files.
+ *
+ * If the files are already extracted, this will be a no-op.
+ *
+ * @param {string} sourcePath Source zip file path.
+ * @param {string} destDir Extraction destination directory.
+ */
+async function maybeExtract(sourcePath, destDir) {
+  return new Promise((resolve, reject) => {
+    if (fs.existsSync(destDir)) {
+      return resolve();
+    }
+    console.log(`Extracting: ${sourcePath} --> ${destDir}`);
+    extract(sourcePath, {dir: destDir}, err => {
+      if (err == null) {
+        return resolve();
+      } else {
+        return reject(err);
+      }
+    });
+  });
+}
+
+const ZIP_SUFFIX = '.zip';
+
+/**
+ * Get the IMDB data through file downloading and extraction.
+ *
+ * If the files already exist on the local file system, the download and/or
+ * extraction steps will be skipped.
+ */
+async function maybeDownloadAndExtract() {
+  const zipDownloadDest = path.join(os.tmpdir(), path.basename(DATA_ZIP_URL));
+  await maybeDownload(DATA_ZIP_URL, zipDownloadDest);
+
+  const zipExtractDir =
+      zipDownloadDest.slice(0, zipDownloadDest.length - ZIP_SUFFIX.length);
+  await maybeExtract(zipDownloadDest, zipExtractDir);
+  return zipExtractDir;
+}
+
+/**
+ * Load data by downloading and extracting files if necessary.
+ *
+ * @param {number} numWords Number of words to in the vocabulary.
+ * @param {number} len Length of each sequence. Longer sequences will
+ *   be pre-truncated and shorter ones will be pre-padded.
+ * @return
+ *   xTrain: Training data as a `tf.Tensor` of shape
+ *     `[numExamples, len]` and `int32` dtype.
+ *   yTrain: Targets for the training data, as a `tf.Tensor` of
+ *     `[numExamples, 1]` and `float32` dtype. The values are 0 or 1.
+ *   xTest: The same as `xTrain`, but for the test dataset.
+ *   yTest: The same as `yTrain`, but for the test dataset.
+ */
+export async function loadData(numWords, len) {
+  const dataDir = await maybeDownloadAndExtract();
+
+  const trainFeaturePath = path.join(dataDir, 'imdb_train_data.bin');
+  const xTrain = loadFeatures(trainFeaturePath, numWords, len);
+  const testFeaturePath = path.join(dataDir, 'imdb_test_data.bin');
+  const xTest = loadFeatures(testFeaturePath, numWords, len);
+  const trainTargetsPath = path.join(dataDir, 'imdb_train_targets.bin');
+  const yTrain = loadTargets(trainTargetsPath);
+  const testTargetsPath = path.join(dataDir, 'imdb_test_targets.bin');
+  const yTest = loadTargets(testTargetsPath);
+
+  tf.util.assert(
+      xTrain.shape[0] === yTrain.shape[0],
+      `Mismatch in number of examples between xTrain and yTrain`);
+  tf.util.assert(
+      xTest.shape[0] === yTest.shape[0],
+      `Mismatch in number of examples between xTest and yTest`);
+  return {xTrain, yTrain, xTest, yTest};
+}
+
+/**
+ * Load a metadata template by downloading and extracting files if necessary.
+ *
+ * @return A JSON object that is the metadata template.
+ */
+export async function loadMetadataTemplate() {
+  const baseName = path.basename(METADATA_TEMPLATE_URL);
+  const zipDownloadDest = path.join(os.tmpdir(), baseName);
+  await maybeDownload(METADATA_TEMPLATE_URL, zipDownloadDest);
+
+  const zipExtractDir =
+      zipDownloadDest.slice(0, zipDownloadDest.length - ZIP_SUFFIX.length);
+  await maybeExtract(zipDownloadDest, zipExtractDir);
+
+  return JSON.parse(fs.readFileSync(
+      path.join(zipExtractDir,
+                baseName.slice(0, baseName.length - ZIP_SUFFIX.length))));
+}
diff --git a/sentiment/index.js b/sentiment/index.js
@@ -18,7 +18,7 @@
 import * as tf from '@tensorflow/tfjs';
 import * as loader from './loader';
 import * as ui from './ui';
-
+import {OOV_CHAR, padSequences} from './sequence_utils';
 
 const HOSTED_URLS = {
   model:
@@ -28,8 +28,8 @@ const HOSTED_URLS = {
 };
 
 const LOCAL_URLS = {
-  model: 'http://localhost:1235/resources/model.json',
-  metadata: 'http://localhost:1235/resources/metadata.json'
+  model: './resources/model.json',
+  metadata: './resources/metadata.json'
 };
 
 class SentimentPredictor {
@@ -52,23 +52,27 @@ class SentimentPredictor {
     console.log('indexFrom = ' + this.indexFrom);
     console.log('maxLen = ' + this.maxLen);
 
-    this.wordIndex = sentimentMetadata['word_index']
+    this.wordIndex = sentimentMetadata['word_index'];
+    this.vocabularySize = sentimentMetadata['vocabulary_size'];
+    console.log('vocabularySize = ', this.vocabularySize);
   }
 
   predict(text) {
     // Convert to lower case and remove all punctuations.
     const inputText =
         text.trim().toLowerCase().replace(/(\.|\,|\!)/g, '').split(' ');
-    // Look up word indices.
-    const inputBuffer = tf.buffer([1, this.maxLen], 'float32');
-    for (let i = 0; i < inputText.length; ++i) {
-      // TODO(cais): Deal with OOV words.
-      const word = inputText[i];
-      inputBuffer.set(this.wordIndex[word] + this.indexFrom, 0, i);
-    }
-    const input = inputBuffer.toTensor();
+    // Convert the words to a sequence of word indices.
+    const sequence = inputText.map(word => {
+      let wordIndex = this.wordIndex[word] + this.indexFrom;
+      if (wordIndex > this.vocabularySize) {
+        wordIndex = OOV_CHAR;
+      }
+      return wordIndex;
+    });
+    // Perform truncation and padding.
+    const paddedSequence = padSequences([sequence], this.maxLen);
+    const input = tf.tensor2d(paddedSequence, [1, this.maxLen]);
 
-    ui.status('Running inference');
     const beginMs = performance.now();
     const predictOut = this.model.predict(input);
     const score = predictOut.dataSync()[0];
@@ -79,7 +83,6 @@ class SentimentPredictor {
   }
 };
 
-
 /**
  * Loads the pretrained model and metadata, and registers the predict
  * function with the UI.

diff --git a/sentiment/package.json b/sentiment/package.json
@@ -9,24 +9,31 @@
     "node": ">=8.9.0"
   },
   "dependencies": {
-    "@tensorflow/tfjs": "^0.14.2",
+    "@tensorflow/tfjs": "0.14.2",
     "vega-embed": "^3.0.0"
   },
   "scripts": {
     "watch": "./serve.sh",
     "build": "cross-env NODE_ENV=production parcel build index.html  --no-minify --public-url ./",
     "link-local": "yalc link",
-    "postinstall": "yarn upgrade --pattern @tensorflow"
+    "postinstall": "yarn upgrade --pattern @tensorflow",
+    "train": "babel-node train.js"
   },
   "devDependencies": {
+    "@tensorflow/tfjs-node": "0.2.3",
+    "@tensorflow/tfjs-node-gpu": "0.2.3",
+    "argparse": "^1.0.10",
+    "babel-cli": "^6.26.0",
     "babel-core": "^6.26.3",
     "babel-plugin-transform-runtime": "~6.23.0",
     "babel-polyfill": "~6.26.0",
     "babel-preset-env": "~1.6.1",
     "clang-format": "~1.2.2",
     "cross-env": "^5.1.6",
+    "extract-zip": "^1.6.7",
     "http-server": "~0.10.0",
     "parcel-bundler": "~1.10.3",
+    "shelljs": "^0.8.3",
     "yalc": "~1.0.0-pre.22"
   }
 }