Merge pull request #1 from cadia-lvl/fairseq-g2p

Add fairseq models
cadia-lvl · Nov 27, 2020 · 1b8ed07 · 1b8ed07
2 parents e481792 + 2fa2f29
commit 1b8ed07
Show file tree

Hide file tree

Showing 5 changed files with 187 additions and 11 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,11 @@
 FROM python:3.7-slim
 RUN apt-get -yqq update && apt-get install -yqq g++ libopenblas-base libopenblas-dev swig
-COPY . /app
+ENV G2P_MODEL_DIR=/app/fairseq_g2p/
 WORKDIR /app
+COPY requirements.txt ./
 RUN pip install -r requirements.txt
 EXPOSE 8000
 VOLUME /app/final.mdl
+COPY . /app
 ENTRYPOINT ["gunicorn", "--bind", "0.0.0.0:8000", "--access-logfile", "-", "--error-logfile", "-"]
 CMD ["app:app"]
diff --git a/README.md b/README.md
@@ -1,12 +1,13 @@
 # g2p-service
 
 Naive Flask wrapper for
-[Sequitur](https://github.com/sequitur-g2p/sequitur-g2p). Exposes a simple REST
-API.
+[Sequitur](https://github.com/sequitur-g2p/sequitur-g2p) and [fairseq g2p
+models](https://github.com/grammatek/g2p-lstm). Exposes a simple REST API.
 
 ## Usage
 Example service endpoint for Icelandic available at
-https://nlp.talgreinir.is/pron (courtesy of [Tiro](https://tiro.is))
+https://nlp.talgreinir.is/pron (courtesy of [Tiro](https://tiro.is)). This
+hosted endpoint does not support the fairseq models.
 
 How do I pronounce `derp`?
 
@@ -51,6 +52,56 @@ Multiple word support with a POST.
 
 Append `?t=tsv` to get the response in the Kaldi lexicon format.
 
+Append `?m=fairseq` to use the fairseq model instead of the Sequitur model.
+
+    $ cat <<EOF | curl -XPOST -d@- "http://localhost:8000/pron?m=fairseq" | jq
+    {"words": ["herp", "derp"]}
+    EOF
+    [
+      {
+        "results": [
+          {
+            "pronunciation": "h E r_0 p"
+          }
+        ],
+        "word": "herp"
+      },
+      {
+        "results": [
+          {
+            "pronunciation": "t E r_0 p"
+          }
+        ],
+        "word": "derp"
+      }
+    ]
+
+Append `?d=north`, `?d=north_east` or `?d=south` to use the northern, north
+eastern or southern dialect instead of the standard one. The `d` parameter is
+only supported by the fairseq model, so combine it with `m=fairseq`.
+
+    $ cat <<EOF | curl -XPOST -d@- "http://localhost:8000/pron?m=fairseq&d=south" | jq
+    {"words": ["herp", "akureyri"]}
+    EOF
+    [
+      {
+        "results": [
+          {
+            "pronunciation": "h E r_0 p"
+          }
+        ],
+        "word": "herp"
+      },
+      {
+        "results": [
+          {
+            "pronunciation": "a: k Y r ei r I"
+          }
+        ],
+        "word": "akureyri"
+      }
+    ]
+
 ## Steps
 
 ### Build Docker image
@@ -63,9 +114,16 @@ Train, or somehow acquire a Sequitur G2P model expose it to the container as
 
     docker run -p 8000:8000 -v <path-to-model>:/app/final.mdl g2p-service
 
+    docker run -p 8000:8000 -v <path-to-model>:/app/final.mdl -v <path-to-grammatek-lstm-g2p-repo>:/app/fairseq_g2p/ g2p-service
+
+Example:
+
+    docker run -it --rm -v ${PWD}/final.mdl:/app/final.mdl -v /home/judyfong/g2p-lstm:/app/fairseq_g2p g2p-service
+
 ## LICENSE
 
     Copyright (C) 2019  Róbert Kjaran <[email protected]>
+    Copyright (C) 2020  Judy Y Fong <[email protected]>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by

diff --git a/app.py b/app.py
@@ -2,14 +2,16 @@
 import math
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
-from flask_autodoc import Autodoc
 
 from g2p import SequiturTool, Translator, loadG2PSample
+from fairseq_g2p import Fairseq_graphemetophoneme as fs_g2p
 
 app = Flask(__name__)
 app.config["JSON_AS_ASCII"] = False
 CORS(app)
-auto = Autodoc(app)
+
+# TODO: only load the sequitur model once
+# TODO: python class template and children for sequitur, fairseq, and thrax?
 
 
 class Options(dict):
@@ -28,6 +30,7 @@ def __getattr__(self, name):
     def __setattr__(self, name, value):
         self[name] = value
 
+grammatek_lstm = fs_g2p()
 
 def pronounce(words):
     options = Options(
@@ -73,29 +76,47 @@ def pron_to_tsv(prons):
 
 
 @app.route("/pron/<word>", methods=["GET", "OPTIONS"])
-@auto.doc()
 def route_pronounce(word):
     """Main entry point - Does the important stuff
     """
+    m = request.args.get("m")
+    if m and m == "fairseq":
+        gen_pronounce = grammatek_lstm.pronounce
+    else:
+        gen_pronounce = pronounce
+    # TODO: make fairseq models work with tsv
     t = request.args.get("t")
     if t and t == "tsv":
         return Response(response=pron_to_tsv(pronounce([word])),
                         status=200,
                         content_type="text/tab-separated-values")
 
-    return jsonify(list(pronounce([word]))), 200
+    d = request.args.get("d")
+    if d and d in grammatek_lstm.possible_dialects:
+        return jsonify(list(gen_pronounce([word], d))), 200
+    return jsonify(list(gen_pronounce([word]))), 200
 
 
 @app.route("/pron", methods=["POST", "OPTIONS"])
-@auto.doc()
 def route_pronounce_many():
     content = request.get_json(force=True)
     if "words" not in content:
         return jsonify({"error": "Field 'words' missing."}), 400
 
+    m = request.args.get("m")
+    if m and m == "fairseq":
+        # d = request.args.get("d")
+        # if d and d in grammatek_lstm.possible_dialects:
+        gen_pronounce = grammatek_lstm.pronounce
+    else:
+        gen_pronounce = pronounce
+    # TODO: make fairseq models work with tsv
     t = request.args.get("t")
     if t and t == "tsv":
         return Response(response=pron_to_tsv(pronounce(content["words"])),
                         status=200,
                         content_type="text/tab-separated-values")
-    return jsonify(list(pronounce(content["words"]))), 200
+    d = request.args.get("d")
+    if d and d in grammatek_lstm.possible_dialects:
+        return jsonify(list(gen_pronounce(content["words"], d))), 200
+    return jsonify(list(gen_pronounce(content["words"]))), 200
diff --git a/fairseq_g2p.py b/fairseq_g2p.py
@@ -0,0 +1,92 @@
+# Copyright (c) Judy Y. Fong <[email protected]>
+#
+# This g2p source code is licensed under the GPL-2.0 License found in the LICENSE
+# file in the root directory of this source tree.
+
+import fairseq
+import torch
+from fairseq.models.transformer import TransformerModel
+import os
+
+class Fairseq_graphemetophoneme:
+    def __init__(self):
+        self.possible_dialects = ['standard', 'north' , 'north_east', 'south']
+        self.dialect_models = {}
+
+        model_dir = os.getenv("G2P_MODEL_DIR", "/app/fairseq_g2p/")
+        """ Select the paths based on dialect """
+        for dialect in self.possible_dialects:
+            data_dir = model_dir + '/data-bin/' + dialect
+            checkpoint_file = model_dir + '/checkpoints/' + dialect + \
+            '-256-.3-s-s/checkpoint_last.pt'
+            self.dialect_models[dialect] = \
+            TransformerModel.from_pretrained(data_dir, checkpoint_file)
+
+    # Function to change 'hlaupa' to 'h l a u p a' etc
+    def words2spaced(self, normal_words):
+        """
+        Change normal words to words with spaces between letters
+
+             e.g. hlaupa to h l a u p a
+        """
+        separated = []
+        for word in normal_words:
+            separated.append(' '.join(char for char in word))
+        return separated
+
+    def examples(self):
+        """
+        Print out examples of the output from fairseq g2p models from grammatek
+        """
+        # Process phrase to work with g2p function
+        # TODO: remove punctuation because it affects the output
+        # phrase = 'Velkomin til íslands.'
+        # phrase = 'Velkomin til íslands'
+        phrase = 'What is up Charlie Zinger Queen'
+        # Change a phrase to a list of words with .split()
+        phrase_spaced = self.words2spaced(phrase.split())
+
+        # Process words to work with g2p function
+        h_l_a_u_p_a = self.words2spaced(['hlaupa'])
+        processed = self.words2spaced(
+            ['Hlaupa', 'derp', 'orð', 'hrafn', 'daginn', 'Akureyri', 'banki']
+        )
+
+        # works with c, w, q, and z
+        # g2p works with lowercased and capital letters
+        # NOTE: punctuation just gives random output so shouldn't allow it to be
+        # passed to self.dialect_models[dialect].translate()
+        dialect = "standard"
+        print(self.dialect_models[dialect].translate(h_l_a_u_p_a))
+        # ['l_0 9i: p a']
+        print(self.dialect_models[dialect].translate(processed))
+        # ['l_0 9i: p a', 't E r_0 p']
+        print(self.dialect_models[dialect].translate(phrase_spaced))
+        # ['c E l k_h O m I n', 't_h I: l', 'i s t l a n t s']
+
+        print('\nnorth')
+        print(self.dialect_models["north"].translate(processed))
+        print('\nnorth east')
+        print(self.dialect_models["north_east"].translate(processed))
+        print('\nsouth')
+        print(self.dialect_models["south"].translate(processed))
+
+    # ['hlaupa','orð', 'derp']
+    def pronounce(self, word_list, dialect='standard'):
+        """ Take in a normal word list and return pronunciation objects """
+        w_o_r_d_l_i_s_t = self.words2spaced(word_list)
+        """ Apply phonemes based on dialect """
+        if dialect in self.possible_dialects:
+            word_phones = \
+                self.dialect_models[dialect].translate(w_o_r_d_l_i_s_t)
+            fairseq_response = []
+            for (phones, word) in zip(word_phones, word_list):
+                fairseq_response.append({
+                    "word": word,
+                    "results": [
+                        { "pronunciation": phones }
+                    ]
+                })
+            return fairseq_response
+        else:
+            raise ValueError("There is no matching dialect g2p model.")
diff --git a/requirements.txt b/requirements.txt
@@ -2,5 +2,8 @@ flask
 numpy
 gunicorn
 flask-cors
-flask-autodoc
 https://github.com/rkjaran/sequitur-g2p/archive/master.zip
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==1.7.0+cpu
+fairseq
+requests