From 3bcc06833a31d0b0316596000b53a34a380da349 Mon Sep 17 00:00:00 2001 From: "Dheeraj R. Reddy" Date: Thu, 13 Jun 2019 18:08:12 +0530 Subject: [PATCH 01/52] Port CRF from tf.contrib to tfa.text --- tensorflow_addons/text/BUILD | 14 + tensorflow_addons/text/README.md | 1 + tensorflow_addons/text/__init__.py | 12 + tensorflow_addons/text/crf_ops.py | 464 +++++++++++++++++++++++++ tensorflow_addons/text/crf_ops_test.py | 358 +++++++++++++++++++ 5 files changed, 849 insertions(+) create mode 100644 tensorflow_addons/text/crf_ops.py create mode 100644 tensorflow_addons/text/crf_ops_test.py diff --git a/tensorflow_addons/text/BUILD b/tensorflow_addons/text/BUILD index 4787cd8c0c..d96bdd582b 100644 --- a/tensorflow_addons/text/BUILD +++ b/tensorflow_addons/text/BUILD @@ -6,6 +6,7 @@ py_library( name = "text", srcs = ([ "__init__.py", + "crf_ops.py", "skip_gram_ops.py", ]), data = [ @@ -15,6 +16,19 @@ py_library( srcs_version = "PY2AND3", ) +py_test( + name = "crf_ops_test", + size = "small", + srcs = [ + "crf_ops_test.py", + ], + main = "crf_ops_test.py", + srcs_version = "PY2AND3", + deps = [ + ":text", + ], +) + py_test( name = "skip_gram_ops_test", size = "small", diff --git a/tensorflow_addons/text/README.md b/tensorflow_addons/text/README.md index 4b4d948363..d6e60a07b9 100644 --- a/tensorflow_addons/text/README.md +++ b/tensorflow_addons/text/README.md @@ -4,6 +4,7 @@ | Submodule | Maintainers | Contact Info | |:---------- |:----------- |:------------- | | skip_gram_ops | | | +| crf | Dheeraj R. Reddy | dheeraj98reddy@gmail.com | ## Components | Submodule | Text Processing Function | Reference | diff --git a/tensorflow_addons/text/__init__.py b/tensorflow_addons/text/__init__.py index 05c758e26d..6c67afa387 100644 --- a/tensorflow_addons/text/__init__.py +++ b/tensorflow_addons/text/__init__.py @@ -20,3 +20,15 @@ # Skip Gram Sampling from tensorflow_addons.text.skip_gram_ops import skip_gram_sample from tensorflow_addons.text.skip_gram_ops import skip_gram_sample_with_text_vocab + +from tensorflow_addons.text.crf_ops import crf_binary_score +from tensorflow_addons.text.crf_ops import crf_decode +from tensorflow_addons.text.crf_ops import crf_log_likelihood +from tensorflow_addons.text.crf_ops import crf_log_norm +from tensorflow_addons.text.crf_ops import crf_multitag_sequence_score +from tensorflow_addons.text.crf_ops import crf_sequence_score +from tensorflow_addons.text.crf_ops import crf_unary_score +from tensorflow_addons.text.crf_ops import CrfDecodeBackwardRnnCell +from tensorflow_addons.text.crf_ops import CrfDecodeForwardRnnCell +from tensorflow_addons.text.crf_ops import CrfForwardRnnCell +from tensorflow_addons.text.crf_ops import viterbi_decode diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf_ops.py new file mode 100644 index 0000000000..7acd10924a --- /dev/null +++ b/tensorflow_addons/text/crf_ops.py @@ -0,0 +1,464 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +def crf_sequence_score(inputs, tag_indices, sequence_lengths, + transition_params): + """Computes the unnormalized score for a tag sequence. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we + compute the unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of the single tag. + def _single_seq_fn(): + batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] + + example_inds = tf.reshape( + tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) + sequence_scores = tf.gather_nd( + tf.squeeze(inputs, [1]), + tf.concat([example_inds, tag_indices], axis=1)) + sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0), + tf.zeros_like(sequence_scores), + sequence_scores) + return sequence_scores + + def _multi_seq_fn(): + # Compute the scores of the given tag sequence. + unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) + binary_scores = crf_binary_score(tag_indices, sequence_lengths, + transition_params) + sequence_scores = unary_scores + binary_scores + return sequence_scores + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() + + +def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths, + transition_params): + """Computes the unnormalized score of all tag sequences matching tag_bitmap. + + tag_bitmap enables more than one tag to be considered correct at each time + step. This is useful when an observed output at a given time step is + consistent with more than one tag, and thus the log likelihood of that + observation must take into account all possible consistent tags. + + Using one-hot vectors in tag_bitmap gives results identical to + crf_sequence_score. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor + representing all active tags at each index for which to calculate the + unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of all active tags. + def _single_seq_fn(): + filtered_inputs = tf.where( + tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return tf.reduce_logsumexp( + filtered_inputs, axis=[1, 2], keepdims=False) + + def _multi_seq_fn(): + # Compute the logsumexp of all scores of sequences matching the given tags. 
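+        # Inactive tags get a -inf potential, so any sequence that visits one
+        # scores -inf and contributes nothing to the logsumexp; what remains
+        # is exactly the mass of the bitmap-consistent sequences.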
+        filtered_inputs = tf.where(
+            tag_bitmap, inputs,
+            tf.fill(tf.shape(inputs), float("-inf")))
+        return crf_log_norm(
+            inputs=filtered_inputs,
+            sequence_lengths=sequence_lengths,
+            transition_params=transition_params)
+
+    if inputs.shape[1] == 1:
+        return _single_seq_fn()
+    else:
+        return _multi_seq_fn()
+
+
+def crf_log_norm(inputs, sequence_lengths, transition_params):
+    """Computes the normalization for a CRF.
+
+    Args:
+        inputs: A [batch_size, max_seq_len, num_tags] tensor of unary
+            potentials to use as input to the CRF layer.
+        sequence_lengths: A [batch_size] vector of true sequence lengths.
+        transition_params: A [num_tags, num_tags] transition matrix.
+    Returns:
+        log_norm: A [batch_size] vector of normalizers for a CRF.
+    """
+    # Split up the first and rest of the inputs in preparation for the
+    # forward algorithm.
+    first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1])
+    first_input = tf.squeeze(first_input, [1])
+
+    # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp
+    # over the "initial state" (the unary potentials).
+    def _single_seq_fn():
+        log_norm = tf.reduce_logsumexp(first_input, [1])
+        # Mask `log_norm` of the sequences with length <= zero.
+        log_norm = tf.where(tf.less_equal(sequence_lengths, 0),
+                            tf.zeros_like(log_norm),
+                            log_norm)
+        return log_norm
+
+    def _multi_seq_fn():
+        """Forward computation of alpha values."""
+        rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
+        # Compute the alpha values in the forward algorithm in order to get
+        # the partition function.
+        forward_cell = CrfForwardRnnCell(transition_params)
+        # Sequence length is not allowed to be less than zero.
+        sequence_lengths_less_one = tf.maximum(
+            tf.constant(0, dtype=sequence_lengths.dtype),
+            sequence_lengths - 1)
+
+        forward_layer = tf.keras.layers.RNN(
+            forward_cell,
+            return_sequences=True,
+            return_state=True)
+
+        # Mask out the padded steps so that the final state holds the alphas
+        # at each sequence's true last step.
+        mask = tf.sequence_mask(sequence_lengths_less_one,
+                                tf.shape(rest_of_input)[1])
+        _, alphas = forward_layer(rest_of_input, first_input, mask=mask)
+
+        log_norm = tf.reduce_logsumexp(alphas, [1])
+        # Mask `log_norm` of the sequences with length <= zero.
+        log_norm = tf.where(tf.less_equal(sequence_lengths, 0),
+                            tf.zeros_like(log_norm),
+                            log_norm)
+        return log_norm
+
+    if inputs.shape[1] == 1:
+        return _single_seq_fn()
+    else:
+        return _multi_seq_fn()
+
+
+def crf_log_likelihood(inputs,
+                       tag_indices,
+                       sequence_lengths,
+                       transition_params=None):
+    """Computes the log-likelihood of tag sequences in a CRF.
+
+    Args:
+        inputs: A [batch_size, max_seq_len, num_tags] tensor of unary
+            potentials to use as input to the CRF layer.
+        tag_indices: A [batch_size, max_seq_len] matrix of tag indices for
+            which we compute the log-likelihood.
+        sequence_lengths: A [batch_size] vector of true sequence lengths.
+        transition_params: A [num_tags, num_tags] transition matrix,
+            if available.
+    Returns:
+        log_likelihood: A [batch_size] `Tensor` containing the log-likelihood
+            of each example, given the sequence of tag indices.
+        transition_params: A [num_tags, num_tags] transition matrix. This is
+            either provided by the caller or created in this function.
+    """
+    # Get shape information.
+    num_tags = inputs.shape[2]
+
+    # Create the transition matrix if not provided. TF 2.x has no
+    # `tf.get_variable`, so use a `tf.Variable` with the same default
+    # (Glorot uniform) initialization.
+    if transition_params is None:
+        initializer = tf.keras.initializers.GlorotUniform()
+        transition_params = tf.Variable(
+            initializer([num_tags, num_tags]), name="transitions")
+
+    sequence_scores = crf_sequence_score(inputs, tag_indices,
+                                         sequence_lengths, transition_params)
+    log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)
+
+    # Normalize the scores to get the log-likelihood per example.
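+    # log p(tags | inputs) = score(inputs, tags) - log Z(inputs), where the
+    # partition function log Z is computed by crf_log_norm.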
+    log_likelihood = sequence_scores - log_norm
+    return log_likelihood, transition_params
+
+
+def crf_unary_score(tag_indices, sequence_lengths, inputs):
+    """Computes the unary scores of tag sequences.
+
+    Args:
+        tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
+        sequence_lengths: A [batch_size] vector of true sequence lengths.
+        inputs: A [batch_size, max_seq_len, num_tags] tensor of unary
+            potentials.
+    Returns:
+        unary_scores: A [batch_size] vector of unary scores.
+    """
+    batch_size = tf.shape(inputs)[0]
+    max_seq_len = tf.shape(inputs)[1]
+    num_tags = tf.shape(inputs)[2]
+
+    flattened_inputs = tf.reshape(inputs, [-1])
+
+    offsets = tf.expand_dims(
+        tf.range(batch_size) * max_seq_len * num_tags, 1)
+    offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
+    # Use int32 or int64 based on tag_indices' dtype.
+    if tag_indices.dtype == tf.int64:
+        offsets = tf.cast(offsets, tf.int64)
+    flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])
+
+    unary_scores = tf.reshape(
+        tf.gather(flattened_inputs, flattened_tag_indices),
+        [batch_size, max_seq_len])
+
+    masks = tf.sequence_mask(sequence_lengths,
+                             maxlen=tf.shape(tag_indices)[1],
+                             dtype=tf.float32)
+
+    unary_scores = tf.reduce_sum(unary_scores * masks, 1)
+    return unary_scores
+
+
+def crf_binary_score(tag_indices, sequence_lengths, transition_params):
+    """Computes the binary scores of tag sequences.
+
+    Args:
+        tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
+        sequence_lengths: A [batch_size] vector of true sequence lengths.
+        transition_params: A [num_tags, num_tags] matrix of binary potentials.
+    Returns:
+        binary_scores: A [batch_size] vector of binary scores.
+    """
+    # Get shape information.
+    num_tags = tf.shape(transition_params)[0]
+    num_transitions = tf.shape(tag_indices)[1] - 1
+
+    # Truncate by one on each side of the sequence to get the start and end
+    # indices of each transition.
+    start_tag_indices = tf.slice(tag_indices, [0, 0],
+                                 [-1, num_transitions])
+    end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions])
+
+    # Encode the indices in a flattened representation.
+    flattened_transition_indices = (
+        start_tag_indices * num_tags + end_tag_indices)
+    flattened_transition_params = tf.reshape(transition_params, [-1])
+
+    # Get the binary scores based on the flattened representation.
+    binary_scores = tf.gather(flattened_transition_params,
+                              flattened_transition_indices)
+
+    masks = tf.sequence_mask(sequence_lengths,
+                             maxlen=tf.shape(tag_indices)[1],
+                             dtype=tf.float32)
+    truncated_masks = tf.slice(masks, [0, 1], [-1, -1])
+    binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1)
+    return binary_scores
+
+
+class CrfForwardRnnCell(tf.keras.layers.Layer):
+    """Computes the alpha values in a linear-chain CRF.
+
+    See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+    """
+
+    def __init__(self, transition_params, **kwargs):
+        """Initialize the CrfForwardRnnCell.
+
+        Args:
+            transition_params: A [num_tags, num_tags] matrix of binary
+                potentials. This matrix is expanded into a
+                [1, num_tags, num_tags] shape in preparation for the
+                broadcast summation occurring within the cell.
+        """
+        super(CrfForwardRnnCell, self).__init__(**kwargs)
+        self._transition_params = tf.expand_dims(transition_params, 0)
+        self._num_tags = transition_params.shape[0]
+        self.state_size = self._num_tags
+        self.output_size = self._num_tags
+
+    def build(self, input_shape):
+        super(CrfForwardRnnCell, self).build(input_shape)
+
+    def call(self, inputs, state, training=None):
+        """Build the CrfForwardRnnCell.
+
+        Args:
+            inputs: A [batch_size, num_tags] matrix of unary potentials.
+            state: A list containing a single [batch_size, num_tags] matrix
+                of previous alpha values.
+            training: Unused.
+
+        Returns:
+            new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices
+                containing the new alpha values.
+        """
+        state = tf.expand_dims(state[0], 2)
+        transition_scores = state + self._transition_params
+        new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1])
+        return new_alphas, new_alphas
+
+
+def viterbi_decode(score, transition_params):
+    """Decode the highest scoring sequence of tags outside of TensorFlow.
+
+    This should only be used at test time.
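+    It operates on NumPy arrays outside of the TensorFlow graph; use
+    `crf_decode` to decode within a graph.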
+
+    Args:
+        score: A [seq_len, num_tags] matrix of unary potentials.
+        transition_params: A [num_tags, num_tags] matrix of binary potentials.
+
+    Returns:
+        viterbi: A [seq_len] list of integers containing the highest scoring
+            tag indices.
+        viterbi_score: A float containing the score for the Viterbi sequence.
+    """
+    trellis = np.zeros_like(score)
+    backpointers = np.zeros_like(score, dtype=np.int32)
+    trellis[0] = score[0]
+
+    for t in range(1, score.shape[0]):
+        v = np.expand_dims(trellis[t - 1], 1) + transition_params
+        trellis[t] = score[t] + np.max(v, 0)
+        backpointers[t] = np.argmax(v, 0)
+
+    viterbi = [np.argmax(trellis[-1])]
+    for bp in reversed(backpointers[1:]):
+        viterbi.append(bp[viterbi[-1]])
+    viterbi.reverse()
+
+    viterbi_score = np.max(trellis[-1])
+    return viterbi, viterbi_score
+
+
+class CrfDecodeForwardRnnCell(tf.keras.layers.Layer):
+    """Computes the forward decoding in a linear-chain CRF."""
+
+    def __init__(self, transition_params, **kwargs):
+        """Initialize the CrfDecodeForwardRnnCell.
+
+        Args:
+            transition_params: A [num_tags, num_tags] matrix of binary
+                potentials. This matrix is expanded into a
+                [1, num_tags, num_tags] shape in preparation for the
+                broadcast summation occurring within the cell.
+        """
+        super(CrfDecodeForwardRnnCell, self).__init__(**kwargs)
+        self._transition_params = tf.expand_dims(transition_params, 0)
+        self._num_tags = transition_params.shape[0]
+        self.state_size = self._num_tags
+        self.output_size = self._num_tags
+
+    def build(self, input_shape):
+        super(CrfDecodeForwardRnnCell, self).build(input_shape)
+
+    def call(self, inputs, state, training=None):
+        """Build the CrfDecodeForwardRnnCell.
+
+        Args:
+            inputs: A [batch_size, num_tags] matrix of unary potentials.
+            state: A list containing a single [batch_size, num_tags] matrix
+                of the previous step's score values.
+            training: Unused.
+
+        Returns:
+            backpointers: A [batch_size, num_tags] matrix of backpointers.
+            new_state: A [batch_size, num_tags] matrix of new score values.
+        """
+        state = tf.expand_dims(state[0], 2)
+        transition_scores = state + self._transition_params
+        new_state = inputs + tf.reduce_max(transition_scores, [1])
+        backpointers = tf.argmax(transition_scores, 1)
+        backpointers = tf.cast(backpointers, dtype=tf.int32)
+        return backpointers, new_state
+
+
+class CrfDecodeBackwardRnnCell(tf.keras.layers.Layer):
+    """Computes backward decoding in a linear-chain CRF."""
+
+    def __init__(self, num_tags, **kwargs):
+        """Initialize the CrfDecodeBackwardRnnCell.
+
+        Args:
+            num_tags: An integer. The number of tags.
+        """
+        super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs)
+        self._num_tags = num_tags
+
+        self.state_size = 1
+        self.output_size = 1
+
+    def build(self, input_shape):
+        super(CrfDecodeBackwardRnnCell, self).build(input_shape)
+
+    def call(self, inputs, state, training=None):
+        """Build the CrfDecodeBackwardRnnCell.
+
+        Args:
+            inputs: A [batch_size, num_tags] matrix of backpointers of the
+                next step (in time order).
+            state: A list containing a single [batch_size, 1] matrix of the
+                tag index of the next step.
+            training: Unused.
+
+        Returns:
+            new_tags, new_tags: A pair of [batch_size, 1] tensors containing
+                the new tag indices.
+        """
+        state = tf.squeeze(state[0], axis=[1])
+        batch_size = tf.shape(inputs)[0]
+        b_indices = tf.range(batch_size)
+        indices = tf.stack([b_indices, state], axis=1)
+        new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1)
+
+        return new_tags, new_tags
+
+
+def crf_decode(potentials, transition_params, sequence_length):
+    """Decode the highest scoring sequence of tags in TensorFlow.
+
+    This is the in-graph (tensor) counterpart of `viterbi_decode`.
+
+    Args:
+        potentials: A [batch_size, max_seq_len, num_tags] tensor of
+            unary potentials.
+        transition_params: A [num_tags, num_tags] matrix of
+            binary potentials.
+        sequence_length: A [batch_size] vector of true sequence lengths.
+
+    Returns:
+        decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
+            Contains the highest scoring tag indices.
+        best_score: A [batch_size] vector, containing the score of
+            `decode_tags`.
+    """
+
+    # If max_seq_len is 1, we skip the algorithm and simply return the argmax
+    # tag and the max activation.
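+    # Otherwise we run a forward pass that records an argmax backpointer for
+    # every tag at every step, then a backward pass that follows those
+    # backpointers from the best final tag to recover the Viterbi sequence.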
+    def _single_seq_fn():
+        squeezed_potentials = tf.squeeze(potentials, [1])
+        decode_tags = tf.expand_dims(
+            tf.argmax(squeezed_potentials, axis=1), 1)
+        best_score = tf.reduce_max(squeezed_potentials, axis=1)
+        return tf.cast(decode_tags, dtype=tf.int32), best_score
+
+    def _multi_seq_fn():
+        """Decoding of highest scoring sequence."""
+
+        # For simplicity, in shape comments, denote:
+        # 'batch_size' by 'B', 'max_seq_len' by 'T', 'num_tags' by 'O' (output).
+        num_tags = potentials.shape[2]
+
+        # Computes forward decoding. Get last score and backpointers.
+        initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1])
+        initial_state = tf.squeeze(initial_state, axis=[1])  # [B, O]
+        inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+        # Sequence length is not allowed to be less than zero.
+        sequence_length_less_one = tf.maximum(
+            tf.constant(0, dtype=sequence_length.dtype),
+            sequence_length - 1)
+
+        crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
+        crf_fwd_layer = tf.keras.layers.RNN(crf_fwd_cell,
+                                            return_sequences=True,
+                                            return_state=True,
+                                            time_major=False)
+        # Mask out padded steps so that `last_score` holds the scores at each
+        # sequence's true final step.
+        mask = tf.sequence_mask(sequence_length_less_one, tf.shape(inputs)[1])
+        backpointers, last_score = crf_fwd_layer(inputs, initial_state,
+                                                 mask=mask)
+        backpointers = tf.reverse_sequence(
+            backpointers, sequence_length_less_one, seq_axis=1)
+
+        crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
+        initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32)
+        initial_state = tf.expand_dims(initial_state, axis=-1)
+        crf_bwd_layer = tf.keras.layers.RNN(crf_bwd_cell,
+                                            return_sequences=True,
+                                            return_state=True,
+                                            time_major=False)
+        decode_tags, _ = crf_bwd_layer(backpointers, initial_state)
+
+        decode_tags = tf.squeeze(decode_tags, axis=[2])  # [B, T - 1]
+        decode_tags = tf.concat([initial_state, decode_tags],  # [B, T]
+                                axis=1)
+        decode_tags = tf.reverse_sequence(  # [B, T]
+            decode_tags, sequence_length, seq_axis=1)
+
+        best_score = tf.reduce_max(last_score, axis=1)  # [B]
+        return decode_tags, best_score
+
+    if potentials.shape[1] == 1:
+        return _single_seq_fn()
+    else:
+        return _multi_seq_fn()
diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_ops_test.py
new file mode 100644
index 0000000000..d706992c32
--- /dev/null
+++ b/tensorflow_addons/text/crf_ops_test.py
@@ -0,0 +1,358 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for CRF.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np +import tensorflow as tf + +from tensorflow_addons import text +from tensorflow_addons.utils import test_utils + + +class CrfTest(tf.test.TestCase): + + def calculateSequenceScore(self, inputs, transition_params, tag_indices, + sequence_lengths): + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + return expected_unary_score + expected_binary_score + + def testCrfSequenceScore(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([1], dtype=np.int32) + ] + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + + tf_sequence_score = self.evaluate(sequence_score) + + expected_sequence_score = self.calculateSequenceScore( + inputs, transition_params, tag_indices, sequence_lengths) + self.assertAllClose(tf_sequence_score, expected_sequence_score) + + def testCrfMultiTagSequenceScore(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. 
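+        # The expected value is computed below by enumerating every tag
+        # sequence consistent with the bitmap and log-sum-exping their scores.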
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], + dtype=np.float32), + ] + tag_bitmap_list = [ + np.array( + [[True, True, False], [True, False, True], [False, True, True], + [True, False, True]], + dtype=np.bool), + np.array([[True, True, False]], dtype=np.bool) + ] + for sequence_lengths, inputs, tag_bitmap in zip( + sequence_lengths_list, inputs_list, tag_bitmap_list): + sequence_score = text.crf_multitag_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_bitmap=tf.expand_dims(tag_bitmap, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + tf_sum_sequence_score = self.evaluate(sequence_score) + all_indices_list = [ + single_index_bitmap.nonzero()[0] + for single_index_bitmap in tag_bitmap[:sequence_lengths] + ] + expected_sequence_scores = [ + self.calculateSequenceScore(inputs, transition_params, indices, + sequence_lengths) + for indices in itertools.product(*all_indices_list) + ] + expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( + expected_sequence_scores) + self.assertAllClose(tf_sum_sequence_score, + expected_log_sum_exp_sequence_scores) + + def testCrfUnaryScore(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + for dtype in (np.int32, np.int64): + tag_indices = np.array([1, 2, 1, 0], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + unary_score = text.crf_unary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + inputs=tf.expand_dims(inputs, 0)) + unary_score = tf.squeeze(unary_score, [0]) + tf_unary_score = self.evaluate(unary_score) + expected_unary_score = sum(inputs[i][tag_indices[i]] + for i in range(sequence_lengths)) + self.assertAllClose(tf_unary_score, expected_unary_score) + + def testCrfBinaryScore(self): + tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + binary_score = text.crf_binary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + binary_score = tf.squeeze(binary_score, [0]) + tf_binary_score = self.evaluate(binary_score) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + self.assertAllClose(tf_binary_score, expected_binary_score) + + def testCrfLogNorm(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[3, -1, 3]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_scores = [] + + # Compare the dynamic program with brute force computation. 
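+            # Enumerate all num_tags**sequence_lengths tag sequences, score
+            # each one, and logsumexp the scores; this must match the
+            # forward-algorithm result from crf_log_norm.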
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequence_scores.append( + text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params))) + + brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) + log_norm = text.crf_log_norm( + inputs=tf.expand_dims(inputs, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + log_norm = tf.squeeze(log_norm, [0]) + tf_brute_force_log_norm, tf_log_norm = self.evaluate( + [brute_force_log_norm, log_norm]) + + self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) + + def testCrfLogNormZeroSeqLength(self): + """ + Test `crf_log_norm` when `sequence_lengths` contains one or more zeros. + """ + inputs = tf.constant(np.ones([2, 10, 5], + dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], + dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], + dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=np.float32) + log_norm = text.crf_log_norm(inputs, sequence_lengths, transition_params) + tf_log_norm = self.evaluate(log_norm) + self.assertAllClose(tf_log_norm, expected_log_norm) + + def testCrfLogLikelihood(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_log_likelihoods = [] + + # Make sure all probabilities sum to 1. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + sequence_log_likelihood, _ = text.crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + all_sequence_log_likelihoods.append(sequence_log_likelihood) + total_log_likelihood = tf.reduce_logsumexp( + all_sequence_log_likelihoods) + tf_total_log_likelihood = self.evaluate(total_log_likelihood) + self.assertAllClose(tf_total_log_likelihood, 0.0) + + def testViterbiDecode(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
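+        # Score every possible tag sequence and take the argmax;
+        # viterbi_decode must recover the same sequence and score.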
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.viterbi_decode( + inputs[:sequence_lengths], transition_params) + + self.assertAllClose(actual_max_score, expected_max_score) + self.assertEqual(actual_max_sequence, + expected_max_sequence[:sequence_lengths]) + + def testCrfDecode(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[-1, 2, 1]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.crf_decode( + tf.expand_dims(inputs, 0), + tf.constant(transition_params), + tf.expand_dims(sequence_lengths, 0)) + actual_max_sequence = tf.squeeze(actual_max_sequence, [0]) + actual_max_score = tf.squeeze(actual_max_score, [0]) + tf_actual_max_sequence, tf_actual_max_score = self.evaluate( + [actual_max_sequence, actual_max_score]) + + self.assertAllClose(tf_actual_max_score, expected_max_score) + self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]), + expected_max_sequence[:sequence_lengths]) + + def testCrfDecodeZeroSeqLength(self): + """ + Test that crf_decode works when sequence_length contains one or more zeros. 
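+        The decode should still return well-formed [batch_size, max_seq_len]
+        tags and [batch_size] scores, which is what the shape assertions check.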
+ """ + inputs = tf.constant(np.ones([2, 10, 5], + dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], + dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], + dtype=np.int32)) + tags, scores = text.crf_decode(inputs, transition_params, sequence_lengths) + tf_tags, tf_scores = self.evaluate([tags, scores]) + self.assertEqual(len(tf_tags.shape), 2) + self.assertEqual(len(tf_scores.shape), 1) + + +if __name__ == "__main__": + tf.test.main() From b24ee9c422f8632ac7928c8008b1cc886f7a4bfb Mon Sep 17 00:00:00 2001 From: "Dheeraj R. Reddy" Date: Thu, 13 Jun 2019 18:08:12 +0530 Subject: [PATCH 02/52] Port CRF from tf.contrib to tfa.text --- tensorflow_addons/text/BUILD | 14 + tensorflow_addons/text/README.md | 1 + tensorflow_addons/text/__init__.py | 12 + tensorflow_addons/text/crf_ops.py | 464 +++++++++++++++++++++++++ tensorflow_addons/text/crf_ops_test.py | 358 +++++++++++++++++++ 5 files changed, 849 insertions(+) create mode 100644 tensorflow_addons/text/crf_ops.py create mode 100644 tensorflow_addons/text/crf_ops_test.py diff --git a/tensorflow_addons/text/BUILD b/tensorflow_addons/text/BUILD index 4787cd8c0c..d96bdd582b 100644 --- a/tensorflow_addons/text/BUILD +++ b/tensorflow_addons/text/BUILD @@ -6,6 +6,7 @@ py_library( name = "text", srcs = ([ "__init__.py", + "crf_ops.py", "skip_gram_ops.py", ]), data = [ @@ -15,6 +16,19 @@ py_library( srcs_version = "PY2AND3", ) +py_test( + name = "crf_ops_test", + size = "small", + srcs = [ + "crf_ops_test.py", + ], + main = "crf_ops_test.py", + srcs_version = "PY2AND3", + deps = [ + ":text", + ], +) + py_test( name = "skip_gram_ops_test", size = "small", diff --git a/tensorflow_addons/text/README.md b/tensorflow_addons/text/README.md index 4b4d948363..d6e60a07b9 100644 --- a/tensorflow_addons/text/README.md +++ b/tensorflow_addons/text/README.md @@ -4,6 +4,7 @@ | Submodule | Maintainers | Contact Info | |:---------- |:----------- |:------------- | | skip_gram_ops | | | +| crf | Dheeraj R. Reddy | dheeraj98reddy@gmail.com | ## Components | Submodule | Text Processing Function | Reference | diff --git a/tensorflow_addons/text/__init__.py b/tensorflow_addons/text/__init__.py index 05c758e26d..6c67afa387 100644 --- a/tensorflow_addons/text/__init__.py +++ b/tensorflow_addons/text/__init__.py @@ -20,3 +20,15 @@ # Skip Gram Sampling from tensorflow_addons.text.skip_gram_ops import skip_gram_sample from tensorflow_addons.text.skip_gram_ops import skip_gram_sample_with_text_vocab + +from tensorflow_addons.text.crf_ops import crf_binary_score +from tensorflow_addons.text.crf_ops import crf_decode +from tensorflow_addons.text.crf_ops import crf_log_likelihood +from tensorflow_addons.text.crf_ops import crf_log_norm +from tensorflow_addons.text.crf_ops import crf_multitag_sequence_score +from tensorflow_addons.text.crf_ops import crf_sequence_score +from tensorflow_addons.text.crf_ops import crf_unary_score +from tensorflow_addons.text.crf_ops import CrfDecodeBackwardRnnCell +from tensorflow_addons.text.crf_ops import CrfDecodeForwardRnnCell +from tensorflow_addons.text.crf_ops import CrfForwardRnnCell +from tensorflow_addons.text.crf_ops import viterbi_decode diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf_ops.py new file mode 100644 index 0000000000..7acd10924a --- /dev/null +++ b/tensorflow_addons/text/crf_ops.py @@ -0,0 +1,464 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +def crf_sequence_score(inputs, tag_indices, sequence_lengths, + transition_params): + """Computes the unnormalized score for a tag sequence. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we + compute the unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of the single tag. + def _single_seq_fn(): + batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] + + example_inds = tf.reshape( + tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) + sequence_scores = tf.gather_nd( + tf.squeeze(inputs, [1]), + tf.concat([example_inds, tag_indices], axis=1)) + sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0), + tf.zeros_like(sequence_scores), + sequence_scores) + return sequence_scores + + def _multi_seq_fn(): + # Compute the scores of the given tag sequence. + unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) + binary_scores = crf_binary_score(tag_indices, sequence_lengths, + transition_params) + sequence_scores = unary_scores + binary_scores + return sequence_scores + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() + + +def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths, + transition_params): + """Computes the unnormalized score of all tag sequences matching tag_bitmap. + + tag_bitmap enables more than one tag to be considered correct at each time + step. This is useful when an observed output at a given time step is + consistent with more than one tag, and thus the log likelihood of that + observation must take into account all possible consistent tags. + + Using one-hot vectors in tag_bitmap gives results identical to + crf_sequence_score. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor + representing all active tags at each index for which to calculate the + unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of all active tags. 
+ def _single_seq_fn(): + filtered_inputs = tf.where( + tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return tf.reduce_logsumexp( + filtered_inputs, axis=[1, 2], keepdims=False) + + def _multi_seq_fn(): + # Compute the logsumexp of all scores of sequences matching the given tags. + filtered_inputs = tf.where( + tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return crf_log_norm( + inputs=filtered_inputs, + sequence_lengths=sequence_lengths, + transition_params=transition_params) + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() + + +def crf_log_norm(inputs, sequence_lengths, transition_params): + """Computes the normalization for a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + log_norm: A [batch_size] vector of normalizers for a CRF. + """ + # Split up the first and rest of the inputs in preparation for the forward + # algorithm. + first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) + first_input = tf.squeeze(first_input, [1]) + + # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over + # the "initial state" (the unary potentials). + def _single_seq_fn(): + log_norm = tf.reduce_logsumexp(first_input, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where(tf.less_equal(sequence_lengths, 0), + tf.zeros_like(log_norm), + log_norm) + return log_norm + + def _multi_seq_fn(): + """Forward computation of alpha values.""" + rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) + # Compute the alpha values in the forward algorithm in order to get the + # partition function. + forward_cell = CrfForwardRnnCell(transition_params) + # Sequence length is not allowed to be less than zero. + sequence_lengths_less_one = tf.maximum( + tf.constant(0, dtype=sequence_lengths.dtype), + sequence_lengths - 1) + + forward_layer = tf.keras.layers.RNN( + forward_cell, + return_sequences=True, + return_state=True) + + _, alphas = forward_layer(rest_of_input, first_input) + + log_norm = tf.reduce_logsumexp(alphas, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where(tf.less_equal(sequence_lengths, 0), + tf.zeros_like(log_norm), + log_norm) + return log_norm + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() + + +def crf_log_likelihood(inputs, + tag_indices, + sequence_lengths, + transition_params=None): + """Computes the log-likelihood of tag sequences in a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we + compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix, if available. + Returns: + log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of + each example, given the sequence of tag indices. + transition_params: A [num_tags, num_tags] transition matrix. This is either + provided by the caller or created in this function. + """ + # Get shape information. + num_tags = inputs.shape[2] + + # Get the transition matrix if not provided. 
+ if transition_params is None: + transition_params = tf.get_variable("transitions", [num_tags, num_tags]) + + sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths, + transition_params) + log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) + + # Normalize the scores to get the log-likelihood per example. + log_likelihood = sequence_scores - log_norm + return log_likelihood, transition_params + + +def crf_unary_score(tag_indices, sequence_lengths, inputs): + """Computes the unary scores of tag sequences. + + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. + Returns: + unary_scores: A [batch_size] vector of unary scores. + """ + batch_size = tf.shape(inputs)[0] + max_seq_len = tf.shape(inputs)[1] + num_tags = tf.shape(inputs)[2] + + flattened_inputs = tf.reshape(inputs, [-1]) + + offsets = tf.expand_dims( + tf.range(batch_size) * max_seq_len * num_tags, 1) + offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) + # Use int32 or int64 based on tag_indices' dtype. + if tag_indices.dtype == tf.int64: + offsets = tf.cast(offsets, tf.int64) + flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) + + unary_scores = tf.reshape( + tf.gather(flattened_inputs, flattened_tag_indices), + [batch_size, max_seq_len]) + + masks = tf.sequence_mask(sequence_lengths, + maxlen=tf.shape(tag_indices)[1], + dtype=tf.float32) + + unary_scores = tf.reduce_sum(unary_scores * masks, 1) + return unary_scores + + +def crf_binary_score(tag_indices, sequence_lengths, transition_params): + """Computes the binary scores of tag sequences. + + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + Returns: + binary_scores: A [batch_size] vector of binary scores. + """ + # Get shape information. + num_tags = tf.shape(transition_params)[0] + num_transitions = tf.shape(tag_indices)[1] - 1 + + # Truncate by one on each side of the sequence to get the start and end + # indices of each transition. + start_tag_indices = tf.slice(tag_indices, [0, 0], + [-1, num_transitions]) + end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) + + # Encode the indices in a flattened representation. + flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices + flattened_transition_params = tf.reshape(transition_params, [-1]) + + # Get the binary scores based on the flattened representation. 
+ binary_scores = tf.gather(flattened_transition_params, + flattened_transition_indices) + + masks = tf.sequence_mask(sequence_lengths, + maxlen=tf.shape(tag_indices)[1], + dtype=tf.float32) + truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) + binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) + return binary_scores + + +class CrfForwardRnnCell(tf.keras.layers.Layer): + def __init__(self, transition_params, **kwargs): + super(CrfForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags + + def build(self, input_shape): + super(CrfForwardRnnCell, self).build(input_shape) + + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) + return new_alphas, new_alphas + + +def viterbi_decode(score, transition_params): + """Decode the highest scoring sequence of tags outside of TensorFlow. + + This should only be used at test time. + + Args: + score: A [seq_len, num_tags] matrix of unary potentials. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + + Returns: + viterbi: A [seq_len] list of integers containing the highest scoring tag + indices. + viterbi_score: A float containing the score for the Viterbi sequence. + """ + trellis = np.zeros_like(score) + backpointers = np.zeros_like(score, dtype=np.int32) + trellis[0] = score[0] + + for t in range(1, score.shape[0]): + v = np.expand_dims(trellis[t - 1], 1) + transition_params + trellis[t] = score[t] + np.max(v, 0) + backpointers[t] = np.argmax(v, 0) + + viterbi = [np.argmax(trellis[-1])] + for bp in reversed(backpointers[1:]): + viterbi.append(bp[viterbi[-1]]) + viterbi.reverse() + + viterbi_score = np.max(trellis[-1]) + return viterbi, viterbi_score + + +class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): + """Computes the forward decoding in a linear-chain CRF. + """ + + def __init__(self, transition_params, **kwargs): + """Initialize the CrfDecodeForwardRnnCell. + + Args: + transition_params: A [num_tags, num_tags] matrix of binary + potentials. This matrix is expanded into a + [1, num_tags, num_tags] in preparation for the broadcast + summation occurring within the cell. + """ + super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags + + def build(self, input_shape): + super(CrfDecodeForwardRnnCell, self).build(input_shape) + + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_state = inputs + tf.reduce_max(transition_scores, [1]) + backpointers = tf.argmax(transition_scores, 1) + backpointers = tf.cast(backpointers, dtype=tf.int32) + return backpointers, new_state + + +class CrfDecodeBackwardRnnCell(tf.keras.layers.Layer): + """Computes backward decoding in a linear-chain CRF. + """ + + def __init__(self, num_tags, **kwargs): + """Initialize the CrfDecodeBackwardRnnCell. + + Args: + num_tags: An integer. The number of tags. 
+ """ + super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) + self._num_tags = num_tags + + self.state_size = 1 + self.output_size = 1 + + def build(self, input_shape): + super(CrfDecodeBackwardRnnCell, self).build(input_shape) + + def call(self, inputs, state, training=None): + state = tf.squeeze(state[0], axis=[1]) + batch_size = tf.shape(inputs)[0] + b_indices = tf.range(batch_size) + indices = tf.stack([b_indices, state], axis=1) + new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1) + + return new_tags, new_tags + + +def crf_decode(potentials, transition_params, sequence_length): + """Decode the highest scoring sequence of tags in TensorFlow. + + This is a function for tensor. + + Args: + potentials: A [batch_size, max_seq_len, num_tags] tensor of + unary potentials. + transition_params: A [num_tags, num_tags] matrix of + binary potentials. + sequence_length: A [batch_size] vector of true sequence lengths. + + Returns: + decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. + Contains the highest scoring tag indices. + best_score: A [batch_size] vector, containing the score of `decode_tags`. + """ + + # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag + # and the max activation. + def _single_seq_fn(): + squeezed_potentials = tf.squeeze(potentials, [1]) + decode_tags = tf.expand_dims( + tf.argmax(squeezed_potentials, axis=1), 1) + best_score = tf.reduce_max(squeezed_potentials, axis=1) + return tf.cast(decode_tags, dtype=tf.int32), best_score + + def _multi_seq_fn(): + """Decoding of highest scoring sequence.""" + + # For simplicity, in shape comments, denote: + # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output). + num_tags = potentials.shape[2] + + # Computes forward decoding. Get last score and backpointers. + initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1]) + initial_state = tf.squeeze(initial_state, axis=[1]) # [B, O] + inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] + # Sequence length is not allowed to be less than zero. 
+ + sequence_length_less_one = tf.maximum( + tf.constant(0, dtype=sequence_length.dtype), + sequence_length - 1) + + crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params) + crf_fwd_layer = tf.keras.layers.RNN(crf_fwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + backpointers, last_score = crf_fwd_layer(inputs, initial_state) + backpointers = tf.reverse_sequence(backpointers, sequence_length_less_one, seq_axis=1) + + crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags) + initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32) + initial_state = tf.expand_dims(initial_state, axis=-1) + crf_bwd_layer = tf.keras.layers.RNN(crf_bwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + decode_tags, _ = crf_bwd_layer(backpointers, initial_state) + + decode_tags = tf.squeeze(decode_tags, axis=[2]) # [B, T - 1] + decode_tags = tf.concat([initial_state, decode_tags], # [B, T] + axis=1) + decode_tags = tf.reverse_sequence( # [B, T] + decode_tags, sequence_length, seq_axis=1) + + best_score = tf.reduce_max(last_score, axis=1) # [B] + return decode_tags, best_score + + if potentials.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_ops_test.py new file mode 100644 index 0000000000..d706992c32 --- /dev/null +++ b/tensorflow_addons/text/crf_ops_test.py @@ -0,0 +1,358 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for CRF.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np +import tensorflow as tf + +from tensorflow_addons import text +from tensorflow_addons.utils import test_utils + + +class CrfTest(tf.test.TestCase): + + def calculateSequenceScore(self, inputs, transition_params, tag_indices, + sequence_lengths): + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + return expected_unary_score + expected_binary_score + + def testCrfSequenceScore(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. 
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([1], dtype=np.int32) + ] + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + + tf_sequence_score = self.evaluate(sequence_score) + + expected_sequence_score = self.calculateSequenceScore( + inputs, transition_params, tag_indices, sequence_lengths) + self.assertAllClose(tf_sequence_score, expected_sequence_score) + + def testCrfMultiTagSequenceScore(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], + dtype=np.float32), + ] + tag_bitmap_list = [ + np.array( + [[True, True, False], [True, False, True], [False, True, True], + [True, False, True]], + dtype=np.bool), + np.array([[True, True, False]], dtype=np.bool) + ] + for sequence_lengths, inputs, tag_bitmap in zip( + sequence_lengths_list, inputs_list, tag_bitmap_list): + sequence_score = text.crf_multitag_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_bitmap=tf.expand_dims(tag_bitmap, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + tf_sum_sequence_score = self.evaluate(sequence_score) + all_indices_list = [ + single_index_bitmap.nonzero()[0] + for single_index_bitmap in tag_bitmap[:sequence_lengths] + ] + expected_sequence_scores = [ + self.calculateSequenceScore(inputs, transition_params, indices, + sequence_lengths) + for indices in itertools.product(*all_indices_list) + ] + expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( + expected_sequence_scores) + self.assertAllClose(tf_sum_sequence_score, + expected_log_sum_exp_sequence_scores) + + def testCrfUnaryScore(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + for dtype in (np.int32, np.int64): + tag_indices = np.array([1, 2, 1, 0], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + unary_score = text.crf_unary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + inputs=tf.expand_dims(inputs, 0)) + unary_score = tf.squeeze(unary_score, [0]) + tf_unary_score = self.evaluate(unary_score) + expected_unary_score = sum(inputs[i][tag_indices[i]] + for i in range(sequence_lengths)) + self.assertAllClose(tf_unary_score, expected_unary_score) + + def testCrfBinaryScore(self): + tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + binary_score = text.crf_binary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + 
sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + binary_score = tf.squeeze(binary_score, [0]) + tf_binary_score = self.evaluate(binary_score) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + self.assertAllClose(tf_binary_score, expected_binary_score) + + def testCrfLogNorm(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[3, -1, 3]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_scores = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequence_scores.append( + text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params))) + + brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) + log_norm = text.crf_log_norm( + inputs=tf.expand_dims(inputs, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + log_norm = tf.squeeze(log_norm, [0]) + tf_brute_force_log_norm, tf_log_norm = self.evaluate( + [brute_force_log_norm, log_norm]) + + self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) + + def testCrfLogNormZeroSeqLength(self): + """ + Test `crf_log_norm` when `sequence_lengths` contains one or more zeros. + """ + inputs = tf.constant(np.ones([2, 10, 5], + dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], + dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], + dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=np.float32) + log_norm = text.crf_log_norm(inputs, sequence_lengths, transition_params) + tf_log_norm = self.evaluate(log_norm) + self.assertAllClose(tf_log_norm, expected_log_norm) + + def testCrfLogLikelihood(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_log_likelihoods = [] + + # Make sure all probabilities sum to 1. 
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + sequence_log_likelihood, _ = text.crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + all_sequence_log_likelihoods.append(sequence_log_likelihood) + total_log_likelihood = tf.reduce_logsumexp( + all_sequence_log_likelihoods) + tf_total_log_likelihood = self.evaluate(total_log_likelihood) + self.assertAllClose(tf_total_log_likelihood, 0.0) + + def testViterbiDecode(self): + inputs = np.array( + [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.viterbi_decode( + inputs[:sequence_lengths], transition_params) + + self.assertAllClose(actual_max_score, expected_max_score) + self.assertEqual(actual_max_sequence, + expected_max_sequence[:sequence_lengths]) + + def testCrfDecode(self): + transition_params = np.array( + [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[-1, 2, 1]], + dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, + inputs_list, + tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
+      for tag_indices in itertools.product(
+          range(num_tags), repeat=sequence_lengths):
+        tag_indices = list(tag_indices)
+        tag_indices.extend([0] * (num_words - sequence_lengths))
+        all_sequences.append(tag_indices)
+        sequence_score = text.crf_sequence_score(
+            inputs=tf.expand_dims(inputs, 0),
+            tag_indices=tf.expand_dims(tag_indices, 0),
+            sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+            transition_params=tf.constant(transition_params))
+        sequence_score = tf.squeeze(sequence_score, [0])
+        all_sequence_scores.append(sequence_score)
+
+      tf_all_sequence_scores = self.evaluate(all_sequence_scores)
+
+      expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
+      expected_max_sequence = all_sequences[expected_max_sequence_index]
+      expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
+
+      actual_max_sequence, actual_max_score = text.crf_decode(
+          tf.expand_dims(inputs, 0),
+          tf.constant(transition_params),
+          tf.expand_dims(sequence_lengths, 0))
+      actual_max_sequence = tf.squeeze(actual_max_sequence, [0])
+      actual_max_score = tf.squeeze(actual_max_score, [0])
+      tf_actual_max_sequence, tf_actual_max_score = self.evaluate(
+          [actual_max_sequence, actual_max_score])
+
+      self.assertAllClose(tf_actual_max_score, expected_max_score)
+      self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
+                       expected_max_sequence[:sequence_lengths])
+
+  def testCrfDecodeZeroSeqLength(self):
+    """
+    Test that crf_decode works when sequence_length contains one or more zeros.
+    """
+    inputs = tf.constant(np.ones([2, 10, 5],
+                                 dtype=np.float32))
+    transition_params = tf.constant(np.ones([5, 5],
+                                            dtype=np.float32))
+    sequence_lengths = tf.constant(np.zeros([2],
+                                            dtype=np.int32))
+    tags, scores = text.crf_decode(inputs, transition_params, sequence_lengths)
+    tf_tags, tf_scores = self.evaluate([tags, scores])
+    self.assertEqual(len(tf_tags.shape), 2)
+    self.assertEqual(len(tf_scores.shape), 1)
+
+
+if __name__ == "__main__":
+  tf.test.main()
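The ops this patch adds are easiest to see end to end in a short training step. The sketch below is illustrative only: it assumes the patch is installed so the module imports as `tensorflow_addons.text`, and every shape and value in it is hypothetical. Note that the transition matrix is passed in explicitly; the `transition_params=None` fallback in `crf_log_likelihood` calls `tf.get_variable`, which is a TF1-only API.

    import tensorflow as tf
    from tensorflow_addons import text

    # Hypothetical sizes: batch of 2, up to 4 steps, 3 tags.
    unary_potentials = tf.random.normal([2, 4, 3])   # emission scores
    gold_tags = tf.constant([[1, 2, 1, 0], [2, 0, 0, 0]], dtype=tf.int32)
    lengths = tf.constant([4, 2], dtype=tf.int32)    # true, unpadded lengths
    transitions = tf.Variable(tf.random.normal([3, 3]), name="transitions")

    with tf.GradientTape() as tape:
        log_likelihood, _ = text.crf_log_likelihood(
            unary_potentials, gold_tags, lengths,
            transition_params=transitions)
        loss = -tf.reduce_mean(log_likelihood)  # maximize the log-likelihood
    grads = tape.gradient(loss, [transitions])
    tf.keras.optimizers.SGD(0.01).apply_gradients(zip(grads, [transitions]))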
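The decomposition that `crf_log_likelihood` relies on can be checked directly, continuing the hypothetical tensors from the sketch above: the log-likelihood of a tag sequence is its unnormalized score minus the partition function.

    score = text.crf_sequence_score(
        unary_potentials, gold_tags, lengths, transitions)
    log_norm = text.crf_log_norm(unary_potentials, lengths, transitions)
    # Per example: log_likelihood == score - log_norm.
    tf.debugging.assert_near(log_likelihood, score - log_norm)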
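Decoding mirrors `testCrfDecode` and `testViterbiDecode` above. Here is a sketch reusing the toy potentials from those tests; the in-graph `crf_decode` and the NumPy `viterbi_decode` should agree on both the best tag sequence and its score.

    import numpy as np
    import tensorflow as tf
    from tensorflow_addons import text

    potentials = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1]],
                          dtype=np.float32)          # [seq_len=3, num_tags=3]
    transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]],
                                 dtype=np.float32)   # [num_tags, num_tags]

    # crf_decode works in-graph on padded batches with explicit lengths.
    tags, best_score = text.crf_decode(
        tf.constant(potentials[None, :, :]),
        tf.constant(transition_params),
        tf.constant([3], dtype=tf.int32))
    # viterbi_decode takes one unpadded [seq_len, num_tags] array of scores.
    np_tags, np_score = text.viterbi_decode(potentials, transition_params)
    # Expect: list(tags[0].numpy()) == np_tags and best_score[0] == np_score.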
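`crf_multitag_sequence_score` takes a boolean bitmap instead of one tag per step, scoring every sequence the bitmap allows. A sketch continuing the arrays above, with the first step ambiguous between two tags (the bitmap itself is hypothetical):

    bitmap = tf.constant([[[True, True, False],     # step 0: tag 0 or tag 1
                           [False, True, False],    # step 1: tag 1 only
                           [False, False, True]]])  # step 2: tag 2 only
    multitag_score = text.crf_multitag_sequence_score(
        tf.constant(potentials[None, :, :]), bitmap,
        tf.constant([3], dtype=tf.int32), tf.constant(transition_params))
    # Equals logsumexp of the scores of the two consistent sequences,
    # [0, 1, 2] and [1, 1, 2]; a one-hot bitmap reduces to crf_sequence_score.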
From 707ed994ceae02d7e1f2d3b94d3032f2ec6e570c Mon Sep 17 00:00:00 2001
From: Dheeraj Rajaram Reddy
Date: Thu, 20 Jun 2019 00:15:09 +0530
Subject: [PATCH 04/52] Format using make code-format

---
 tensorflow_addons/text/crf_ops.py      | 761 ++++++++++++-------------
 tensorflow_addons/text/crf_ops_test.py | 633 ++++++++++----------
 2 files changed, 687 insertions(+), 707 deletions(-)

diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf_ops.py
index 7acd10924a..9e5fd02051 100644
--- a/tensorflow_addons/text/crf_ops.py
+++ b/tensorflow_addons/text/crf_ops.py
@@ -23,442 +23,435 @@
 def crf_sequence_score(inputs, tag_indices, sequence_lengths,
                        transition_params):
-  """Computes the unnormalized score for a tag sequence.
-
-  Args:
-    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
-      to use as input to the CRF layer.
-    tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
-      compute the unnormalized score.
-    sequence_lengths: A [batch_size] vector of true sequence lengths.
-    transition_params: A [num_tags, num_tags] transition matrix.
-  Returns:
-    sequence_scores: A [batch_size] vector of unnormalized sequence scores.
-  """
-
-  # If max_seq_len is 1, we skip the score calculation and simply gather the
-  # unary potentials of the single tag.
- def _single_seq_fn(): - batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] - - example_inds = tf.reshape( - tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) - sequence_scores = tf.gather_nd( - tf.squeeze(inputs, [1]), - tf.concat([example_inds, tag_indices], axis=1)) - sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(sequence_scores), - sequence_scores) - return sequence_scores - - def _multi_seq_fn(): - # Compute the scores of the given tag sequence. - unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) - binary_scores = crf_binary_score(tag_indices, sequence_lengths, - transition_params) - sequence_scores = unary_scores + binary_scores - return sequence_scores - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the unnormalized score for a tag sequence. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we + compute the unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of the single tag. + def _single_seq_fn(): + batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] + + example_inds = tf.reshape( + tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) + sequence_scores = tf.gather_nd( + tf.squeeze(inputs, [1]), + tf.concat([example_inds, tag_indices], axis=1)) + sequence_scores = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(sequence_scores), + sequence_scores) + return sequence_scores + + def _multi_seq_fn(): + # Compute the scores of the given tag sequence. + unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) + binary_scores = crf_binary_score(tag_indices, sequence_lengths, + transition_params) + sequence_scores = unary_scores + binary_scores + return sequence_scores + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths, transition_params): - """Computes the unnormalized score of all tag sequences matching tag_bitmap. - - tag_bitmap enables more than one tag to be considered correct at each time - step. This is useful when an observed output at a given time step is - consistent with more than one tag, and thus the log likelihood of that - observation must take into account all possible consistent tags. - - Using one-hot vectors in tag_bitmap gives results identical to - crf_sequence_score. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor - representing all active tags at each index for which to calculate the - unnormalized score. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix. - Returns: - sequence_scores: A [batch_size] vector of unnormalized sequence scores. - """ - - # If max_seq_len is 1, we skip the score calculation and simply gather the - # unary potentials of all active tags. 
- def _single_seq_fn(): - filtered_inputs = tf.where( - tag_bitmap, inputs, - tf.fill(tf.shape(inputs), float("-inf"))) - return tf.reduce_logsumexp( - filtered_inputs, axis=[1, 2], keepdims=False) - - def _multi_seq_fn(): - # Compute the logsumexp of all scores of sequences matching the given tags. - filtered_inputs = tf.where( - tag_bitmap, inputs, - tf.fill(tf.shape(inputs), float("-inf"))) - return crf_log_norm( - inputs=filtered_inputs, - sequence_lengths=sequence_lengths, - transition_params=transition_params) - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the unnormalized score of all tag sequences matching + tag_bitmap. + + tag_bitmap enables more than one tag to be considered correct at each time + step. This is useful when an observed output at a given time step is + consistent with more than one tag, and thus the log likelihood of that + observation must take into account all possible consistent tags. + + Using one-hot vectors in tag_bitmap gives results identical to + crf_sequence_score. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor + representing all active tags at each index for which to calculate the + unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of all active tags. + def _single_seq_fn(): + filtered_inputs = tf.where(tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return tf.reduce_logsumexp( + filtered_inputs, axis=[1, 2], keepdims=False) + + def _multi_seq_fn(): + # Compute the logsumexp of all scores of sequences matching the given tags. + filtered_inputs = tf.where(tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return crf_log_norm( + inputs=filtered_inputs, + sequence_lengths=sequence_lengths, + transition_params=transition_params) + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_log_norm(inputs, sequence_lengths, transition_params): - """Computes the normalization for a CRF. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix. - Returns: - log_norm: A [batch_size] vector of normalizers for a CRF. - """ - # Split up the first and rest of the inputs in preparation for the forward - # algorithm. - first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) - first_input = tf.squeeze(first_input, [1]) - - # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over - # the "initial state" (the unary potentials). - def _single_seq_fn(): - log_norm = tf.reduce_logsumexp(first_input, [1]) - # Mask `log_norm` of the sequences with length <= zero. - log_norm = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(log_norm), - log_norm) - return log_norm - - def _multi_seq_fn(): - """Forward computation of alpha values.""" - rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) - # Compute the alpha values in the forward algorithm in order to get the - # partition function. 
- forward_cell = CrfForwardRnnCell(transition_params) - # Sequence length is not allowed to be less than zero. - sequence_lengths_less_one = tf.maximum( - tf.constant(0, dtype=sequence_lengths.dtype), - sequence_lengths - 1) - - forward_layer = tf.keras.layers.RNN( - forward_cell, - return_sequences=True, - return_state=True) - - _, alphas = forward_layer(rest_of_input, first_input) - - log_norm = tf.reduce_logsumexp(alphas, [1]) - # Mask `log_norm` of the sequences with length <= zero. - log_norm = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(log_norm), - log_norm) - return log_norm - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the normalization for a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + log_norm: A [batch_size] vector of normalizers for a CRF. + """ + # Split up the first and rest of the inputs in preparation for the forward + # algorithm. + first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) + first_input = tf.squeeze(first_input, [1]) + + # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over + # the "initial state" (the unary potentials). + def _single_seq_fn(): + log_norm = tf.reduce_logsumexp(first_input, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), + log_norm) + return log_norm + + def _multi_seq_fn(): + """Forward computation of alpha values.""" + rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) + # Compute the alpha values in the forward algorithm in order to get the + # partition function. + forward_cell = CrfForwardRnnCell(transition_params) + # Sequence length is not allowed to be less than zero. + sequence_lengths_less_one = tf.maximum( + tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1) + + forward_layer = tf.keras.layers.RNN( + forward_cell, return_sequences=True, return_state=True) + + _, alphas = forward_layer(rest_of_input, first_input) + + log_norm = tf.reduce_logsumexp(alphas, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), + log_norm) + return log_norm + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_log_likelihood(inputs, tag_indices, sequence_lengths, transition_params=None): - """Computes the log-likelihood of tag sequences in a CRF. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we - compute the log-likelihood. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix, if available. - Returns: - log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of - each example, given the sequence of tag indices. - transition_params: A [num_tags, num_tags] transition matrix. This is either - provided by the caller or created in this function. - """ - # Get shape information. - num_tags = inputs.shape[2] - - # Get the transition matrix if not provided. 
- if transition_params is None: - transition_params = tf.get_variable("transitions", [num_tags, num_tags]) - - sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths, - transition_params) - log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) - - # Normalize the scores to get the log-likelihood per example. - log_likelihood = sequence_scores - log_norm - return log_likelihood, transition_params + """Computes the log-likelihood of tag sequences in a CRF. + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we + compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix, if available. + Returns: + log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of + each example, given the sequence of tag indices. + transition_params: A [num_tags, num_tags] transition matrix. This is either + provided by the caller or created in this function. + """ + # Get shape information. + num_tags = inputs.shape[2] -def crf_unary_score(tag_indices, sequence_lengths, inputs): - """Computes the unary scores of tag sequences. + # Get the transition matrix if not provided. + if transition_params is None: + transition_params = tf.get_variable("transitions", + [num_tags, num_tags]) - Args: - tag_indices: A [batch_size, max_seq_len] matrix of tag indices. - sequence_lengths: A [batch_size] vector of true sequence lengths. - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. - Returns: - unary_scores: A [batch_size] vector of unary scores. - """ - batch_size = tf.shape(inputs)[0] - max_seq_len = tf.shape(inputs)[1] - num_tags = tf.shape(inputs)[2] + sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths, + transition_params) + log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) - flattened_inputs = tf.reshape(inputs, [-1]) + # Normalize the scores to get the log-likelihood per example. + log_likelihood = sequence_scores - log_norm + return log_likelihood, transition_params - offsets = tf.expand_dims( - tf.range(batch_size) * max_seq_len * num_tags, 1) - offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) - # Use int32 or int64 based on tag_indices' dtype. - if tag_indices.dtype == tf.int64: - offsets = tf.cast(offsets, tf.int64) - flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) - unary_scores = tf.reshape( - tf.gather(flattened_inputs, flattened_tag_indices), - [batch_size, max_seq_len]) +def crf_unary_score(tag_indices, sequence_lengths, inputs): + """Computes the unary scores of tag sequences. - masks = tf.sequence_mask(sequence_lengths, - maxlen=tf.shape(tag_indices)[1], - dtype=tf.float32) + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. + Returns: + unary_scores: A [batch_size] vector of unary scores. 
+ """ + batch_size = tf.shape(inputs)[0] + max_seq_len = tf.shape(inputs)[1] + num_tags = tf.shape(inputs)[2] - unary_scores = tf.reduce_sum(unary_scores * masks, 1) - return unary_scores + flattened_inputs = tf.reshape(inputs, [-1]) + offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1) + offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) + # Use int32 or int64 based on tag_indices' dtype. + if tag_indices.dtype == tf.int64: + offsets = tf.cast(offsets, tf.int64) + flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) -def crf_binary_score(tag_indices, sequence_lengths, transition_params): - """Computes the binary scores of tag sequences. - - Args: - tag_indices: A [batch_size, max_seq_len] matrix of tag indices. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] matrix of binary potentials. - Returns: - binary_scores: A [batch_size] vector of binary scores. - """ - # Get shape information. - num_tags = tf.shape(transition_params)[0] - num_transitions = tf.shape(tag_indices)[1] - 1 - - # Truncate by one on each side of the sequence to get the start and end - # indices of each transition. - start_tag_indices = tf.slice(tag_indices, [0, 0], - [-1, num_transitions]) - end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) - - # Encode the indices in a flattened representation. - flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices - flattened_transition_params = tf.reshape(transition_params, [-1]) - - # Get the binary scores based on the flattened representation. - binary_scores = tf.gather(flattened_transition_params, - flattened_transition_indices) - - masks = tf.sequence_mask(sequence_lengths, - maxlen=tf.shape(tag_indices)[1], - dtype=tf.float32) - truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) - binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) - return binary_scores + unary_scores = tf.reshape( + tf.gather(flattened_inputs, flattened_tag_indices), + [batch_size, max_seq_len]) + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32) -class CrfForwardRnnCell(tf.keras.layers.Layer): - def __init__(self, transition_params, **kwargs): - super(CrfForwardRnnCell, self).__init__(**kwargs) - self._transition_params = tf.expand_dims(transition_params, 0) - self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + unary_scores = tf.reduce_sum(unary_scores * masks, 1) + return unary_scores - def build(self, input_shape): - super(CrfForwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): - state = tf.expand_dims(state[0], 2) - transition_scores = state + self._transition_params - new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) - return new_alphas, new_alphas +def crf_binary_score(tag_indices, sequence_lengths, transition_params): + """Computes the binary scores of tag sequences. + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + Returns: + binary_scores: A [batch_size] vector of binary scores. + """ + # Get shape information. 
+ num_tags = tf.shape(transition_params)[0] + num_transitions = tf.shape(tag_indices)[1] - 1 -def viterbi_decode(score, transition_params): - """Decode the highest scoring sequence of tags outside of TensorFlow. + # Truncate by one on each side of the sequence to get the start and end + # indices of each transition. + start_tag_indices = tf.slice(tag_indices, [0, 0], [-1, num_transitions]) + end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) - This should only be used at test time. + # Encode the indices in a flattened representation. + flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices + flattened_transition_params = tf.reshape(transition_params, [-1]) - Args: - score: A [seq_len, num_tags] matrix of unary potentials. - transition_params: A [num_tags, num_tags] matrix of binary potentials. + # Get the binary scores based on the flattened representation. + binary_scores = tf.gather(flattened_transition_params, + flattened_transition_indices) - Returns: - viterbi: A [seq_len] list of integers containing the highest scoring tag - indices. - viterbi_score: A float containing the score for the Viterbi sequence. - """ - trellis = np.zeros_like(score) - backpointers = np.zeros_like(score, dtype=np.int32) - trellis[0] = score[0] + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32) + truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) + binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) + return binary_scores - for t in range(1, score.shape[0]): - v = np.expand_dims(trellis[t - 1], 1) + transition_params - trellis[t] = score[t] + np.max(v, 0) - backpointers[t] = np.argmax(v, 0) - viterbi = [np.argmax(trellis[-1])] - for bp in reversed(backpointers[1:]): - viterbi.append(bp[viterbi[-1]]) - viterbi.reverse() +class CrfForwardRnnCell(tf.keras.layers.Layer): + def __init__(self, transition_params, **kwargs): + super(CrfForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags - viterbi_score = np.max(trellis[-1]) - return viterbi, viterbi_score + def build(self, input_shape): + super(CrfForwardRnnCell, self).build(input_shape) + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) + return new_alphas, new_alphas -class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): - """Computes the forward decoding in a linear-chain CRF. - """ - def __init__(self, transition_params, **kwargs): - """Initialize the CrfDecodeForwardRnnCell. +def viterbi_decode(score, transition_params): + """Decode the highest scoring sequence of tags outside of TensorFlow. + + This should only be used at test time. Args: - transition_params: A [num_tags, num_tags] matrix of binary - potentials. This matrix is expanded into a - [1, num_tags, num_tags] in preparation for the broadcast - summation occurring within the cell. + score: A [seq_len, num_tags] matrix of unary potentials. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + + Returns: + viterbi: A [seq_len] list of integers containing the highest scoring tag + indices. + viterbi_score: A float containing the score for the Viterbi sequence. 
""" - super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) - self._transition_params = tf.expand_dims(transition_params, 0) - self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + trellis = np.zeros_like(score) + backpointers = np.zeros_like(score, dtype=np.int32) + trellis[0] = score[0] - def build(self, input_shape): - super(CrfDecodeForwardRnnCell, self).build(input_shape) + for t in range(1, score.shape[0]): + v = np.expand_dims(trellis[t - 1], 1) + transition_params + trellis[t] = score[t] + np.max(v, 0) + backpointers[t] = np.argmax(v, 0) - def call(self, inputs, state, training=None): - state = tf.expand_dims(state[0], 2) - transition_scores = state + self._transition_params - new_state = inputs + tf.reduce_max(transition_scores, [1]) - backpointers = tf.argmax(transition_scores, 1) - backpointers = tf.cast(backpointers, dtype=tf.int32) - return backpointers, new_state + viterbi = [np.argmax(trellis[-1])] + for bp in reversed(backpointers[1:]): + viterbi.append(bp[viterbi[-1]]) + viterbi.reverse() + + viterbi_score = np.max(trellis[-1]) + return viterbi, viterbi_score + + +class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): + """Computes the forward decoding in a linear-chain CRF.""" + + def __init__(self, transition_params, **kwargs): + """Initialize the CrfDecodeForwardRnnCell. + + Args: + transition_params: A [num_tags, num_tags] matrix of binary + potentials. This matrix is expanded into a + [1, num_tags, num_tags] in preparation for the broadcast + summation occurring within the cell. + """ + super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags + + def build(self, input_shape): + super(CrfDecodeForwardRnnCell, self).build(input_shape) + + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_state = inputs + tf.reduce_max(transition_scores, [1]) + backpointers = tf.argmax(transition_scores, 1) + backpointers = tf.cast(backpointers, dtype=tf.int32) + return backpointers, new_state class CrfDecodeBackwardRnnCell(tf.keras.layers.Layer): - """Computes backward decoding in a linear-chain CRF. - """ + """Computes backward decoding in a linear-chain CRF.""" - def __init__(self, num_tags, **kwargs): - """Initialize the CrfDecodeBackwardRnnCell. + def __init__(self, num_tags, **kwargs): + """Initialize the CrfDecodeBackwardRnnCell. - Args: - num_tags: An integer. The number of tags. - """ - super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) - self._num_tags = num_tags + Args: + num_tags: An integer. The number of tags. 
+ """ + super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) + self._num_tags = num_tags - self.state_size = 1 - self.output_size = 1 + self.state_size = 1 + self.output_size = 1 - def build(self, input_shape): - super(CrfDecodeBackwardRnnCell, self).build(input_shape) + def build(self, input_shape): + super(CrfDecodeBackwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): - state = tf.squeeze(state[0], axis=[1]) - batch_size = tf.shape(inputs)[0] - b_indices = tf.range(batch_size) - indices = tf.stack([b_indices, state], axis=1) - new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1) + def call(self, inputs, state, training=None): + state = tf.squeeze(state[0], axis=[1]) + batch_size = tf.shape(inputs)[0] + b_indices = tf.range(batch_size) + indices = tf.stack([b_indices, state], axis=1) + new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1) - return new_tags, new_tags + return new_tags, new_tags def crf_decode(potentials, transition_params, sequence_length): - """Decode the highest scoring sequence of tags in TensorFlow. - - This is a function for tensor. - - Args: - potentials: A [batch_size, max_seq_len, num_tags] tensor of - unary potentials. - transition_params: A [num_tags, num_tags] matrix of - binary potentials. - sequence_length: A [batch_size] vector of true sequence lengths. - - Returns: - decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. - Contains the highest scoring tag indices. - best_score: A [batch_size] vector, containing the score of `decode_tags`. - """ - - # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag - # and the max activation. - def _single_seq_fn(): - squeezed_potentials = tf.squeeze(potentials, [1]) - decode_tags = tf.expand_dims( - tf.argmax(squeezed_potentials, axis=1), 1) - best_score = tf.reduce_max(squeezed_potentials, axis=1) - return tf.cast(decode_tags, dtype=tf.int32), best_score - - def _multi_seq_fn(): - """Decoding of highest scoring sequence.""" - - # For simplicity, in shape comments, denote: - # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output). - num_tags = potentials.shape[2] - - # Computes forward decoding. Get last score and backpointers. - initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1]) - initial_state = tf.squeeze(initial_state, axis=[1]) # [B, O] - inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] - # Sequence length is not allowed to be less than zero. 
-
-    sequence_length_less_one = tf.maximum(
-        tf.constant(0, dtype=sequence_length.dtype),
-        sequence_length - 1)
-
-    crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
-    crf_fwd_layer = tf.keras.layers.RNN(crf_fwd_cell,
-                                        return_sequences=True,
-                                        return_state=True,
-                                        time_major=False)
-    backpointers, last_score = crf_fwd_layer(inputs, initial_state)
-    backpointers = tf.reverse_sequence(backpointers, sequence_length_less_one, seq_axis=1)
-
-    crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
-    initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32)
-    initial_state = tf.expand_dims(initial_state, axis=-1)
-    crf_bwd_layer = tf.keras.layers.RNN(crf_bwd_cell,
-                                        return_sequences=True,
-                                        return_state=True,
-                                        time_major=False)
-    decode_tags, _ = crf_bwd_layer(backpointers, initial_state)
-
-    decode_tags = tf.squeeze(decode_tags, axis=[2])  # [B, T - 1]
-    decode_tags = tf.concat([initial_state, decode_tags],  # [B, T]
-                            axis=1)
-    decode_tags = tf.reverse_sequence(  # [B, T]
-        decode_tags, sequence_length, seq_axis=1)
-
-    best_score = tf.reduce_max(last_score, axis=1)  # [B]
-    return decode_tags, best_score
-
-  if potentials.shape[1] == 1:
-    return _single_seq_fn()
-  else:
-    return _multi_seq_fn()
+    """Decode the highest scoring sequence of tags in TensorFlow.
+
+    This is a tensor-based (in-graph) implementation; to decode outside of
+    TensorFlow, use `viterbi_decode` instead.
+
+    Args:
+      potentials: A [batch_size, max_seq_len, num_tags] tensor of
+          unary potentials.
+      transition_params: A [num_tags, num_tags] matrix of
+          binary potentials.
+      sequence_length: A [batch_size] vector of true sequence lengths.
+
+    Returns:
+      decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
+          Contains the highest scoring tag indices.
+      best_score: A [batch_size] vector, containing the score of `decode_tags`.
+    """
+
+    # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
+    # and the max activation.
+    def _single_seq_fn():
+        squeezed_potentials = tf.squeeze(potentials, [1])
+        decode_tags = tf.expand_dims(tf.argmax(squeezed_potentials, axis=1), 1)
+        best_score = tf.reduce_max(squeezed_potentials, axis=1)
+        return tf.cast(decode_tags, dtype=tf.int32), best_score
+
+    def _multi_seq_fn():
+        """Decoding of highest scoring sequence."""
+
+        # For simplicity, in shape comments, denote:
+        # 'batch_size' by 'B', 'max_seq_len' by 'T', 'num_tags' by 'O' (output).
+        num_tags = potentials.shape[2]
+
+        # Computes forward decoding. Get last score and backpointers.
+        initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1])
+        initial_state = tf.squeeze(initial_state, axis=[1])  # [B, O]
+        inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+        # Sequence length is not allowed to be less than zero.
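+        # (A zero-length sequence would otherwise yield -1 here, and the
+        # tf.reverse_sequence call below requires non-negative lengths.)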
+ + sequence_length_less_one = tf.maximum( + tf.constant(0, dtype=sequence_length.dtype), sequence_length - 1) + + crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params) + crf_fwd_layer = tf.keras.layers.RNN( + crf_fwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + backpointers, last_score = crf_fwd_layer(inputs, initial_state) + backpointers = tf.reverse_sequence( + backpointers, sequence_length_less_one, seq_axis=1) + + crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags) + initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32) + initial_state = tf.expand_dims(initial_state, axis=-1) + crf_bwd_layer = tf.keras.layers.RNN( + crf_bwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + decode_tags, _ = crf_bwd_layer(backpointers, initial_state) + + decode_tags = tf.squeeze(decode_tags, axis=[2]) # [B, T - 1] + decode_tags = tf.concat( + [initial_state, decode_tags], # [B, T] + axis=1) + decode_tags = tf.reverse_sequence( # [B, T] + decode_tags, sequence_length, seq_axis=1) + + best_score = tf.reduce_max(last_score, axis=1) # [B] + return decode_tags, best_score + + if potentials.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_ops_test.py index d706992c32..ad22d95a2f 100644 --- a/tensorflow_addons/text/crf_ops_test.py +++ b/tensorflow_addons/text/crf_ops_test.py @@ -28,331 +28,318 @@ class CrfTest(tf.test.TestCase): - - def calculateSequenceScore(self, inputs, transition_params, tag_indices, - sequence_lengths): - expected_unary_score = sum( - inputs[i][tag_indices[i]] for i in range(sequence_lengths)) - expected_binary_score = sum( - transition_params[tag_indices[i], tag_indices[i + 1]] - for i in range(sequence_lengths - 1)) - return expected_unary_score + expected_binary_score - - def testCrfSequenceScore(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. - sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[4, 5, -3]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([1], dtype=np.int32) - ] - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - - tf_sequence_score = self.evaluate(sequence_score) - - expected_sequence_score = self.calculateSequenceScore( - inputs, transition_params, tag_indices, sequence_lengths) - self.assertAllClose(tf_sequence_score, expected_sequence_score) - - def testCrfMultiTagSequenceScore(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. 
- sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[4, 5, -3]], - dtype=np.float32), - ] - tag_bitmap_list = [ - np.array( - [[True, True, False], [True, False, True], [False, True, True], - [True, False, True]], - dtype=np.bool), - np.array([[True, True, False]], dtype=np.bool) - ] - for sequence_lengths, inputs, tag_bitmap in zip( - sequence_lengths_list, inputs_list, tag_bitmap_list): - sequence_score = text.crf_multitag_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_bitmap=tf.expand_dims(tag_bitmap, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - tf_sum_sequence_score = self.evaluate(sequence_score) - all_indices_list = [ - single_index_bitmap.nonzero()[0] - for single_index_bitmap in tag_bitmap[:sequence_lengths] - ] - expected_sequence_scores = [ - self.calculateSequenceScore(inputs, transition_params, indices, - sequence_lengths) - for indices in itertools.product(*all_indices_list) - ] - expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( - expected_sequence_scores) - self.assertAllClose(tf_sum_sequence_score, - expected_log_sum_exp_sequence_scores) - - def testCrfUnaryScore(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - for dtype in (np.int32, np.int64): - tag_indices = np.array([1, 2, 1, 0], dtype=dtype) - sequence_lengths = np.array(3, dtype=np.int32) - unary_score = text.crf_unary_score( - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - inputs=tf.expand_dims(inputs, 0)) - unary_score = tf.squeeze(unary_score, [0]) - tf_unary_score = self.evaluate(unary_score) - expected_unary_score = sum(inputs[i][tag_indices[i]] - for i in range(sequence_lengths)) - self.assertAllClose(tf_unary_score, expected_unary_score) - - def testCrfBinaryScore(self): - tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - binary_score = text.crf_binary_score( - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - binary_score = tf.squeeze(binary_score, [0]) - tf_binary_score = self.evaluate(binary_score) - expected_binary_score = sum( - transition_params[tag_indices[i], tag_indices[i + 1]] - for i in range(sequence_lengths - 1)) - self.assertAllClose(tf_binary_score, expected_binary_score) - - def testCrfLogNorm(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. - sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int64) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[3, -1, 3]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([2], dtype=np.int32) - ] - - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - all_sequence_scores = [] - - # Compare the dynamic program with brute force computation. 
- for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequence_scores.append( - text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), + def calculateSequenceScore(self, inputs, transition_params, tag_indices, + sequence_lengths): + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + return expected_unary_score + expected_binary_score + + def testCrfSequenceScore(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([1], dtype=np.int32) + ] + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + + tf_sequence_score = self.evaluate(sequence_score) + + expected_sequence_score = self.calculateSequenceScore( + inputs, transition_params, tag_indices, sequence_lengths) + self.assertAllClose(tf_sequence_score, expected_sequence_score) + + def testCrfMultiTagSequenceScore(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. 
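+        # (The expected value below enumerates every tag sequence consistent
+        # with tag_bitmap and logsumexps their individual scores.)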
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], dtype=np.float32), + ] + tag_bitmap_list = [ + np.array([[True, True, False], [True, False, True], + [False, True, True], [True, False, True]], + dtype=np.bool), + np.array([[True, True, False]], dtype=np.bool) + ] + for sequence_lengths, inputs, tag_bitmap in zip( + sequence_lengths_list, inputs_list, tag_bitmap_list): + sequence_score = text.crf_multitag_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_bitmap=tf.expand_dims(tag_bitmap, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + tf_sum_sequence_score = self.evaluate(sequence_score) + all_indices_list = [ + single_index_bitmap.nonzero()[0] + for single_index_bitmap in tag_bitmap[:sequence_lengths] + ] + expected_sequence_scores = [ + self.calculateSequenceScore(inputs, transition_params, indices, + sequence_lengths) + for indices in itertools.product(*all_indices_list) + ] + expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( + expected_sequence_scores) + self.assertAllClose(tf_sum_sequence_score, + expected_log_sum_exp_sequence_scores) + + def testCrfUnaryScore(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + for dtype in (np.int32, np.int64): + tag_indices = np.array([1, 2, 1, 0], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + unary_score = text.crf_unary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + inputs=tf.expand_dims(inputs, 0)) + unary_score = tf.squeeze(unary_score, [0]) + tf_unary_score = self.evaluate(unary_score) + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + self.assertAllClose(tf_unary_score, expected_unary_score) + + def testCrfBinaryScore(self): + tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + binary_score = text.crf_binary_score( tag_indices=tf.expand_dims(tag_indices, 0), sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params))) - - brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) - log_norm = text.crf_log_norm( - inputs=tf.expand_dims(inputs, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - log_norm = tf.squeeze(log_norm, [0]) - tf_brute_force_log_norm, tf_log_norm = self.evaluate( - [brute_force_log_norm, log_norm]) - - self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) - - def testCrfLogNormZeroSeqLength(self): - """ - Test `crf_log_norm` when `sequence_lengths` contains one or more zeros. 
- """ - inputs = tf.constant(np.ones([2, 10, 5], - dtype=np.float32)) - transition_params = tf.constant(np.ones([5, 5], - dtype=np.float32)) - sequence_lengths = tf.constant(np.zeros([2], - dtype=np.int32)) - expected_log_norm = np.zeros([2], dtype=np.float32) - log_norm = text.crf_log_norm(inputs, sequence_lengths, transition_params) - tf_log_norm = self.evaluate(log_norm) - self.assertAllClose(tf_log_norm, expected_log_norm) - - def testCrfLogLikelihood(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - all_sequence_log_likelihoods = [] - - # Make sure all probabilities sum to 1. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - sequence_log_likelihood, _ = text.crf_log_likelihood( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - all_sequence_log_likelihoods.append(sequence_log_likelihood) - total_log_likelihood = tf.reduce_logsumexp( - all_sequence_log_likelihoods) - tf_total_log_likelihood = self.evaluate(total_log_likelihood) - self.assertAllClose(tf_total_log_likelihood, 0.0) - - def testViterbiDecode(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - - all_sequence_scores = [] - all_sequences = [] - - # Compare the dynamic program with brute force computation. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequences.append(tag_indices) - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - all_sequence_scores.append(sequence_score) - - tf_all_sequence_scores = self.evaluate(all_sequence_scores) - - expected_max_sequence_index = np.argmax(tf_all_sequence_scores) - expected_max_sequence = all_sequences[expected_max_sequence_index] - expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] - - actual_max_sequence, actual_max_score = text.viterbi_decode( - inputs[:sequence_lengths], transition_params) - - self.assertAllClose(actual_max_score, expected_max_score) - self.assertEqual(actual_max_sequence, - expected_max_sequence[:sequence_lengths]) - - def testCrfDecode(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. 
- sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int64) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[-1, 2, 1]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([2], dtype=np.int32) - ] - - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - - all_sequence_scores = [] - all_sequences = [] - - # Compare the dynamic program with brute force computation. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequences.append(tag_indices) - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - all_sequence_scores.append(sequence_score) - - tf_all_sequence_scores = self.evaluate(all_sequence_scores) - - expected_max_sequence_index = np.argmax(tf_all_sequence_scores) - expected_max_sequence = all_sequences[expected_max_sequence_index] - expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] - - actual_max_sequence, actual_max_score = text.crf_decode( - tf.expand_dims(inputs, 0), - tf.constant(transition_params), - tf.expand_dims(sequence_lengths, 0)) - actual_max_sequence = tf.squeeze(actual_max_sequence, [0]) - actual_max_score = tf.squeeze(actual_max_score, [0]) - tf_actual_max_sequence, tf_actual_max_score = self.evaluate( - [actual_max_sequence, actual_max_score]) - - self.assertAllClose(tf_actual_max_score, expected_max_score) - self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]), - expected_max_sequence[:sequence_lengths]) - - def testCrfDecodeZeroSeqLength(self): - """ - Test that crf_decode works when sequence_length contains one or more zeros. - """ - inputs = tf.constant(np.ones([2, 10, 5], - dtype=np.float32)) - transition_params = tf.constant(np.ones([5, 5], - dtype=np.float32)) - sequence_lengths = tf.constant(np.zeros([2], - dtype=np.int32)) - tags, scores = text.crf_decode(inputs, transition_params, sequence_lengths) - tf_tags, tf_scores = self.evaluate([tags, scores]) - self.assertEqual(len(tf_tags.shape), 2) - self.assertEqual(len(tf_scores.shape), 1) + transition_params=tf.constant(transition_params)) + binary_score = tf.squeeze(binary_score, [0]) + tf_binary_score = self.evaluate(binary_score) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + self.assertAllClose(tf_binary_score, expected_binary_score) + + def testCrfLogNorm(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. 
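+        # (The length-1 input exercises the single-step shortcut inside
+        # crf_log_norm; the length-3 input exercises the RNN-based forward
+        # algorithm.)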
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[3, -1, 3]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_scores = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequence_scores.append( + text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params))) + + brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) + log_norm = text.crf_log_norm( + inputs=tf.expand_dims(inputs, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + log_norm = tf.squeeze(log_norm, [0]) + tf_brute_force_log_norm, tf_log_norm = self.evaluate( + [brute_force_log_norm, log_norm]) + + self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) + + def testCrfLogNormZeroSeqLength(self): + """Test `crf_log_norm` when `sequence_lengths` contains one or more + zeros.""" + inputs = tf.constant(np.ones([2, 10, 5], dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=np.float32) + log_norm = text.crf_log_norm(inputs, sequence_lengths, + transition_params) + tf_log_norm = self.evaluate(log_norm) + self.assertAllClose(tf_log_norm, expected_log_norm) + + def testCrfLogLikelihood(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_log_likelihoods = [] + + # Make sure all probabilities sum to 1. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + sequence_log_likelihood, _ = text.crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + all_sequence_log_likelihoods.append(sequence_log_likelihood) + total_log_likelihood = tf.reduce_logsumexp( + all_sequence_log_likelihoods) + tf_total_log_likelihood = self.evaluate(total_log_likelihood) + self.assertAllClose(tf_total_log_likelihood, 0.0) + + def testViterbiDecode(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
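+        # (Brute force here means scoring all num_tags**sequence_lengths
+        # candidate tag sequences and taking the argmax.)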
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[ + expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.viterbi_decode( + inputs[:sequence_lengths], transition_params) + + self.assertAllClose(actual_max_score, expected_max_score) + self.assertEqual(actual_max_sequence, + expected_max_sequence[:sequence_lengths]) + + def testCrfDecode(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[-1, 2, 1]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
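+            # (crf_decode should pick out exactly the sequence that wins this
+            # exhaustive enumeration, and with the same score.)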
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[ + expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.crf_decode( + tf.expand_dims(inputs, 0), tf.constant(transition_params), + tf.expand_dims(sequence_lengths, 0)) + actual_max_sequence = tf.squeeze(actual_max_sequence, [0]) + actual_max_score = tf.squeeze(actual_max_score, [0]) + tf_actual_max_sequence, tf_actual_max_score = self.evaluate( + [actual_max_sequence, actual_max_score]) + + self.assertAllClose(tf_actual_max_score, expected_max_score) + self.assertEqual( + list(tf_actual_max_sequence[:sequence_lengths]), + expected_max_sequence[:sequence_lengths]) + + def testCrfDecodeZeroSeqLength(self): + """Test that crf_decode works when sequence_length contains one or more + zeros.""" + inputs = tf.constant(np.ones([2, 10, 5], dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], dtype=np.int32)) + tags, scores = text.crf_decode(inputs, transition_params, + sequence_lengths) + tf_tags, tf_scores = self.evaluate([tags, scores]) + self.assertEqual(len(tf_tags.shape), 2) + self.assertEqual(len(tf_scores.shape), 1) if __name__ == "__main__": - tf.test.main() + tf.test.main() From 829ac65004ebb56d5c86f766b8609b850f895fb1 Mon Sep 17 00:00:00 2001 From: Dheeraj Rajaram Reddy Date: Fri, 21 Jun 2019 13:34:43 +0530 Subject: [PATCH 05/52] Add tf.function to all the CRF functions --- tensorflow_addons/text/crf_ops.py | 7 +++++++ tensorflow_addons/text/crf_ops_test.py | 1 + 2 files changed, 8 insertions(+) diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf_ops.py index 9e5fd02051..0d77f94207 100644 --- a/tensorflow_addons/text/crf_ops.py +++ b/tensorflow_addons/text/crf_ops.py @@ -21,6 +21,7 @@ import tensorflow as tf +@tf.function def crf_sequence_score(inputs, tag_indices, sequence_lengths, transition_params): """Computes the unnormalized score for a tag sequence. @@ -65,6 +66,7 @@ def _multi_seq_fn(): return _multi_seq_fn() +@tf.function def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths, transition_params): """Computes the unnormalized score of all tag sequences matching @@ -113,6 +115,7 @@ def _multi_seq_fn(): return _multi_seq_fn() +@tf.function def crf_log_norm(inputs, sequence_lengths, transition_params): """Computes the normalization for a CRF. @@ -167,6 +170,7 @@ def _multi_seq_fn(): return _multi_seq_fn() +@tf.function def crf_log_likelihood(inputs, tag_indices, sequence_lengths, @@ -203,6 +207,7 @@ def crf_log_likelihood(inputs, return log_likelihood, transition_params +@tf.function def crf_unary_score(tag_indices, sequence_lengths, inputs): """Computes the unary scores of tag sequences. 
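
For context on the decorator this patch adds throughout: `tf.function` traces
the decorated Python function into a TensorFlow graph on its first call and
reuses that graph on later calls with matching input signatures. A minimal,
self-contained sketch of the behavior (illustrative only, not part of this
patch):

import tensorflow as tf

@tf.function
def scaled_sum(x, y):
    # Traced into a graph on the first call; subsequent calls with the same
    # dtypes and shapes reuse the traced graph instead of re-running Python.
    return 2.0 * (x + y)

print(scaled_sum(tf.constant(1.0), tf.constant(2.0)))
# -> tf.Tensor(6.0, shape=(), dtype=float32)
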
@@ -237,6 +242,7 @@ def crf_unary_score(tag_indices, sequence_lengths, inputs): return unary_scores +@tf.function def crf_binary_score(tag_indices, sequence_lengths, transition_params): """Computes the binary scores of tag sequences. @@ -379,6 +385,7 @@ def call(self, inputs, state, training=None): return new_tags, new_tags +@tf.function def crf_decode(potentials, transition_params, sequence_length): """Decode the highest scoring sequence of tags in TensorFlow. diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_ops_test.py index ad22d95a2f..84c09b539b 100644 --- a/tensorflow_addons/text/crf_ops_test.py +++ b/tensorflow_addons/text/crf_ops_test.py @@ -27,6 +27,7 @@ from tensorflow_addons.utils import test_utils +@test_utils.run_all_in_graph_and_eager_modes class CrfTest(tf.test.TestCase): def calculateSequenceScore(self, inputs, transition_params, tag_indices, sequence_lengths): From 9140ce17b7a1062aa818c44b563b779269cc3a53 Mon Sep 17 00:00:00 2001 From: "Dheeraj R. Reddy" Date: Mon, 24 Jun 2019 19:03:14 +0530 Subject: [PATCH 06/52] RNN call masks computation based on seq len --- tensorflow_addons/text/crf_ops.py | 766 ++++++++++++------------- tensorflow_addons/text/crf_ops_test.py | 634 ++++++++++---------- 2 files changed, 693 insertions(+), 707 deletions(-) diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf_ops.py index 7acd10924a..90f595e125 100644 --- a/tensorflow_addons/text/crf_ops.py +++ b/tensorflow_addons/text/crf_ops.py @@ -23,442 +23,440 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths, transition_params): - """Computes the unnormalized score for a tag sequence. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we - compute the unnormalized score. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix. - Returns: - sequence_scores: A [batch_size] vector of unnormalized sequence scores. - """ - - # If max_seq_len is 1, we skip the score calculation and simply gather the - # unary potentials of the single tag. - def _single_seq_fn(): - batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] - - example_inds = tf.reshape( - tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) - sequence_scores = tf.gather_nd( - tf.squeeze(inputs, [1]), - tf.concat([example_inds, tag_indices], axis=1)) - sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(sequence_scores), - sequence_scores) - return sequence_scores - - def _multi_seq_fn(): - # Compute the scores of the given tag sequence. - unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) - binary_scores = crf_binary_score(tag_indices, sequence_lengths, - transition_params) - sequence_scores = unary_scores + binary_scores - return sequence_scores - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the unnormalized score for a tag sequence. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. 
+ Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of the single tag. + def _single_seq_fn(): + batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0] + + example_inds = tf.reshape( + tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) + sequence_scores = tf.gather_nd( + tf.squeeze(inputs, [1]), + tf.concat([example_inds, tag_indices], axis=1)) + sequence_scores = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(sequence_scores), + sequence_scores) + return sequence_scores + + def _multi_seq_fn(): + # Compute the scores of the given tag sequence. + unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs) + binary_scores = crf_binary_score(tag_indices, sequence_lengths, + transition_params) + sequence_scores = unary_scores + binary_scores + return sequence_scores + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths, transition_params): - """Computes the unnormalized score of all tag sequences matching tag_bitmap. - - tag_bitmap enables more than one tag to be considered correct at each time - step. This is useful when an observed output at a given time step is - consistent with more than one tag, and thus the log likelihood of that - observation must take into account all possible consistent tags. - - Using one-hot vectors in tag_bitmap gives results identical to - crf_sequence_score. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor - representing all active tags at each index for which to calculate the - unnormalized score. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix. - Returns: - sequence_scores: A [batch_size] vector of unnormalized sequence scores. - """ - - # If max_seq_len is 1, we skip the score calculation and simply gather the - # unary potentials of all active tags. - def _single_seq_fn(): - filtered_inputs = tf.where( - tag_bitmap, inputs, - tf.fill(tf.shape(inputs), float("-inf"))) - return tf.reduce_logsumexp( - filtered_inputs, axis=[1, 2], keepdims=False) - - def _multi_seq_fn(): - # Compute the logsumexp of all scores of sequences matching the given tags. - filtered_inputs = tf.where( - tag_bitmap, inputs, - tf.fill(tf.shape(inputs), float("-inf"))) - return crf_log_norm( - inputs=filtered_inputs, - sequence_lengths=sequence_lengths, - transition_params=transition_params) - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the unnormalized score of all tag sequences matching + tag_bitmap. + + tag_bitmap enables more than one tag to be considered correct at each time + step. This is useful when an observed output at a given time step is + consistent with more than one tag, and thus the log likelihood of that + observation must take into account all possible consistent tags. + + Using one-hot vectors in tag_bitmap gives results identical to + crf_sequence_score. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. 
+ tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor + representing all active tags at each index for which to calculate the + unnormalized score. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + sequence_scores: A [batch_size] vector of unnormalized sequence scores. + """ + + # If max_seq_len is 1, we skip the score calculation and simply gather the + # unary potentials of all active tags. + def _single_seq_fn(): + filtered_inputs = tf.where(tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return tf.reduce_logsumexp( + filtered_inputs, axis=[1, 2], keepdims=False) + + def _multi_seq_fn(): + # Compute the logsumexp of all scores of sequences matching the given tags. + filtered_inputs = tf.where(tag_bitmap, inputs, + tf.fill(tf.shape(inputs), float("-inf"))) + return crf_log_norm( + inputs=filtered_inputs, + sequence_lengths=sequence_lengths, + transition_params=transition_params) + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_log_norm(inputs, sequence_lengths, transition_params): - """Computes the normalization for a CRF. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix. - Returns: - log_norm: A [batch_size] vector of normalizers for a CRF. - """ - # Split up the first and rest of the inputs in preparation for the forward - # algorithm. - first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) - first_input = tf.squeeze(first_input, [1]) - - # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over - # the "initial state" (the unary potentials). - def _single_seq_fn(): - log_norm = tf.reduce_logsumexp(first_input, [1]) - # Mask `log_norm` of the sequences with length <= zero. - log_norm = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(log_norm), - log_norm) - return log_norm - - def _multi_seq_fn(): - """Forward computation of alpha values.""" - rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) - # Compute the alpha values in the forward algorithm in order to get the - # partition function. - forward_cell = CrfForwardRnnCell(transition_params) - # Sequence length is not allowed to be less than zero. - sequence_lengths_less_one = tf.maximum( - tf.constant(0, dtype=sequence_lengths.dtype), - sequence_lengths - 1) - - forward_layer = tf.keras.layers.RNN( - forward_cell, - return_sequences=True, - return_state=True) - - _, alphas = forward_layer(rest_of_input, first_input) - - log_norm = tf.reduce_logsumexp(alphas, [1]) - # Mask `log_norm` of the sequences with length <= zero. - log_norm = tf.where(tf.less_equal(sequence_lengths, 0), - tf.zeros_like(log_norm), - log_norm) - return log_norm - - if inputs.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Computes the normalization for a CRF. + + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix. + Returns: + log_norm: A [batch_size] vector of normalizers for a CRF. + """ + # Split up the first and rest of the inputs in preparation for the forward + # algorithm. 
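+    # (log_norm is the log partition function: the logsumexp of the scores of
+    # all possible tag sequences, computed by the forward algorithm below
+    # rather than by explicit enumeration.)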
+ first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1]) + first_input = tf.squeeze(first_input, [1]) + + # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over + # the "initial state" (the unary potentials). + def _single_seq_fn(): + log_norm = tf.reduce_logsumexp(first_input, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), + log_norm) + return log_norm + + def _multi_seq_fn(): + """Forward computation of alpha values.""" + rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1]) + # Compute the alpha values in the forward algorithm in order to get the + # partition function. + forward_cell = CrfForwardRnnCell(transition_params) + # Sequence length is not allowed to be less than zero. + sequence_lengths_less_one = tf.maximum( + tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1) + + forward_layer = tf.keras.layers.RNN( + forward_cell, return_sequences=True, return_state=True) + + mask = tf.sequence_mask(sequence_lengths_less_one, + tf.shape(inputs)[1] - 1) + _, alphas = forward_layer(rest_of_input, first_input, mask=mask) + log_norm = tf.reduce_logsumexp(alphas, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = tf.where( + tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm), + log_norm) + return log_norm + + if inputs.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() def crf_log_likelihood(inputs, tag_indices, sequence_lengths, transition_params=None): - """Computes the log-likelihood of tag sequences in a CRF. - - Args: - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials - to use as input to the CRF layer. - tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we - compute the log-likelihood. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] transition matrix, if available. - Returns: - log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of - each example, given the sequence of tag indices. - transition_params: A [num_tags, num_tags] transition matrix. This is either - provided by the caller or created in this function. - """ - # Get shape information. - num_tags = inputs.shape[2] - - # Get the transition matrix if not provided. - if transition_params is None: - transition_params = tf.get_variable("transitions", [num_tags, num_tags]) - - sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths, - transition_params) - log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) - - # Normalize the scores to get the log-likelihood per example. - log_likelihood = sequence_scores - log_norm - return log_likelihood, transition_params + """Computes the log-likelihood of tag sequences in a CRF. + Args: + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] transition matrix, + if available. + Returns: + log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of + each example, given the sequence of tag indices. + transition_params: A [num_tags, num_tags] transition matrix. This is + either provided by the caller or created in this function. 
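+
+    Example (an illustrative sketch; the tensors below are stand-ins chosen
+    only to satisfy the documented shapes):
+
+      inputs = tf.random.normal([2, 5, 3])     # [batch, max_seq_len, num_tags]
+      tags = tf.zeros([2, 5], dtype=tf.int32)  # [batch, max_seq_len]
+      lens = tf.constant([5, 3])               # [batch]
+      trans = tf.random.normal([3, 3])         # [num_tags, num_tags]
+      log_likelihood, _ = crf_log_likelihood(inputs, tags, lens, trans)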
+ """ + # Get shape information. + num_tags = inputs.shape[2] -def crf_unary_score(tag_indices, sequence_lengths, inputs): - """Computes the unary scores of tag sequences. + # Get the transition matrix if not provided. + if transition_params is None: + transition_params = tf.get_variable("transitions", + [num_tags, num_tags]) - Args: - tag_indices: A [batch_size, max_seq_len] matrix of tag indices. - sequence_lengths: A [batch_size] vector of true sequence lengths. - inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. - Returns: - unary_scores: A [batch_size] vector of unary scores. - """ - batch_size = tf.shape(inputs)[0] - max_seq_len = tf.shape(inputs)[1] - num_tags = tf.shape(inputs)[2] + sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths, + transition_params) + log_norm = crf_log_norm(inputs, sequence_lengths, transition_params) - flattened_inputs = tf.reshape(inputs, [-1]) + # Normalize the scores to get the log-likelihood per example. + log_likelihood = sequence_scores - log_norm + return log_likelihood, transition_params - offsets = tf.expand_dims( - tf.range(batch_size) * max_seq_len * num_tags, 1) - offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) - # Use int32 or int64 based on tag_indices' dtype. - if tag_indices.dtype == tf.int64: - offsets = tf.cast(offsets, tf.int64) - flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) - unary_scores = tf.reshape( - tf.gather(flattened_inputs, flattened_tag_indices), - [batch_size, max_seq_len]) +def crf_unary_score(tag_indices, sequence_lengths, inputs): + """Computes the unary scores of tag sequences. - masks = tf.sequence_mask(sequence_lengths, - maxlen=tf.shape(tag_indices)[1], - dtype=tf.float32) + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. + Returns: + unary_scores: A [batch_size] vector of unary scores. + """ + batch_size = tf.shape(inputs)[0] + max_seq_len = tf.shape(inputs)[1] + num_tags = tf.shape(inputs)[2] - unary_scores = tf.reduce_sum(unary_scores * masks, 1) - return unary_scores + flattened_inputs = tf.reshape(inputs, [-1]) + offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1) + offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0) + # Use int32 or int64 based on tag_indices' dtype. + if tag_indices.dtype == tf.int64: + offsets = tf.cast(offsets, tf.int64) + flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1]) -def crf_binary_score(tag_indices, sequence_lengths, transition_params): - """Computes the binary scores of tag sequences. - - Args: - tag_indices: A [batch_size, max_seq_len] matrix of tag indices. - sequence_lengths: A [batch_size] vector of true sequence lengths. - transition_params: A [num_tags, num_tags] matrix of binary potentials. - Returns: - binary_scores: A [batch_size] vector of binary scores. - """ - # Get shape information. - num_tags = tf.shape(transition_params)[0] - num_transitions = tf.shape(tag_indices)[1] - 1 - - # Truncate by one on each side of the sequence to get the start and end - # indices of each transition. - start_tag_indices = tf.slice(tag_indices, [0, 0], - [-1, num_transitions]) - end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) - - # Encode the indices in a flattened representation. 
- flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices - flattened_transition_params = tf.reshape(transition_params, [-1]) - - # Get the binary scores based on the flattened representation. - binary_scores = tf.gather(flattened_transition_params, - flattened_transition_indices) - - masks = tf.sequence_mask(sequence_lengths, - maxlen=tf.shape(tag_indices)[1], - dtype=tf.float32) - truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) - binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) - return binary_scores + unary_scores = tf.reshape( + tf.gather(flattened_inputs, flattened_tag_indices), + [batch_size, max_seq_len]) + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32) -class CrfForwardRnnCell(tf.keras.layers.Layer): - def __init__(self, transition_params, **kwargs): - super(CrfForwardRnnCell, self).__init__(**kwargs) - self._transition_params = tf.expand_dims(transition_params, 0) - self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + unary_scores = tf.reduce_sum(unary_scores * masks, 1) + return unary_scores - def build(self, input_shape): - super(CrfForwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): - state = tf.expand_dims(state[0], 2) - transition_scores = state + self._transition_params - new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) - return new_alphas, new_alphas +def crf_binary_score(tag_indices, sequence_lengths, transition_params): + """Computes the binary scores of tag sequences. + Args: + tag_indices: A [batch_size, max_seq_len] matrix of tag indices. + sequence_lengths: A [batch_size] vector of true sequence lengths. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + Returns: + binary_scores: A [batch_size] vector of binary scores. + """ + # Get shape information. + num_tags = tf.shape(transition_params)[0] + num_transitions = tf.shape(tag_indices)[1] - 1 -def viterbi_decode(score, transition_params): - """Decode the highest scoring sequence of tags outside of TensorFlow. + # Truncate by one on each side of the sequence to get the start and end + # indices of each transition. + start_tag_indices = tf.slice(tag_indices, [0, 0], [-1, num_transitions]) + end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions]) - This should only be used at test time. + # Encode the indices in a flattened representation. + flattened_transition_indices = start_tag_indices * \ + num_tags + end_tag_indices + flattened_transition_params = tf.reshape(transition_params, [-1]) - Args: - score: A [seq_len, num_tags] matrix of unary potentials. - transition_params: A [num_tags, num_tags] matrix of binary potentials. + # Get the binary scores based on the flattened representation. + binary_scores = tf.gather(flattened_transition_params, + flattened_transition_indices) - Returns: - viterbi: A [seq_len] list of integers containing the highest scoring tag - indices. - viterbi_score: A float containing the score for the Viterbi sequence. 
- """ - trellis = np.zeros_like(score) - backpointers = np.zeros_like(score, dtype=np.int32) - trellis[0] = score[0] + masks = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32) + truncated_masks = tf.slice(masks, [0, 1], [-1, -1]) + binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1) + return binary_scores - for t in range(1, score.shape[0]): - v = np.expand_dims(trellis[t - 1], 1) + transition_params - trellis[t] = score[t] + np.max(v, 0) - backpointers[t] = np.argmax(v, 0) - viterbi = [np.argmax(trellis[-1])] - for bp in reversed(backpointers[1:]): - viterbi.append(bp[viterbi[-1]]) - viterbi.reverse() +class CrfForwardRnnCell(tf.keras.layers.Layer): + def __init__(self, transition_params, **kwargs): + super(CrfForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags - viterbi_score = np.max(trellis[-1]) - return viterbi, viterbi_score + def build(self, input_shape): + super(CrfForwardRnnCell, self).build(input_shape) + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) + return new_alphas, new_alphas -class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): - """Computes the forward decoding in a linear-chain CRF. - """ - def __init__(self, transition_params, **kwargs): - """Initialize the CrfDecodeForwardRnnCell. +def viterbi_decode(score, transition_params): + """Decode the highest scoring sequence of tags outside of TensorFlow. + + This should only be used at test time. Args: - transition_params: A [num_tags, num_tags] matrix of binary - potentials. This matrix is expanded into a - [1, num_tags, num_tags] in preparation for the broadcast - summation occurring within the cell. + score: A [seq_len, num_tags] matrix of unary potentials. + transition_params: A [num_tags, num_tags] matrix of binary potentials. + + Returns: + viterbi: A [seq_len] list of integers containing the highest scoring tag + indices. + viterbi_score: A float containing the score for the Viterbi sequence. 
""" - super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) - self._transition_params = tf.expand_dims(transition_params, 0) - self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + trellis = np.zeros_like(score) + backpointers = np.zeros_like(score, dtype=np.int32) + trellis[0] = score[0] - def build(self, input_shape): - super(CrfDecodeForwardRnnCell, self).build(input_shape) + for t in range(1, score.shape[0]): + v = np.expand_dims(trellis[t - 1], 1) + transition_params + trellis[t] = score[t] + np.max(v, 0) + backpointers[t] = np.argmax(v, 0) - def call(self, inputs, state, training=None): - state = tf.expand_dims(state[0], 2) - transition_scores = state + self._transition_params - new_state = inputs + tf.reduce_max(transition_scores, [1]) - backpointers = tf.argmax(transition_scores, 1) - backpointers = tf.cast(backpointers, dtype=tf.int32) - return backpointers, new_state + viterbi = [np.argmax(trellis[-1])] + for bp in reversed(backpointers[1:]): + viterbi.append(bp[viterbi[-1]]) + viterbi.reverse() + + viterbi_score = np.max(trellis[-1]) + return viterbi, viterbi_score + + +class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): + """Computes the forward decoding in a linear-chain CRF.""" + + def __init__(self, transition_params, **kwargs): + """Initialize the CrfDecodeForwardRnnCell. + + Args: + transition_params: A [num_tags, num_tags] matrix of binary + potentials. This matrix is expanded into a + [1, num_tags, num_tags] in preparation for the broadcast + summation occurring within the cell. + """ + super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) + self._transition_params = tf.expand_dims(transition_params, 0) + self._num_tags = transition_params.shape[0] + self.state_size = self._num_tags + self.output_size = self._num_tags + + def build(self, input_shape): + super(CrfDecodeForwardRnnCell, self).build(input_shape) + + def call(self, inputs, state, training=None): + state = tf.expand_dims(state[0], 2) + transition_scores = state + self._transition_params + new_state = inputs + tf.reduce_max(transition_scores, [1]) + backpointers = tf.argmax(transition_scores, 1) + backpointers = tf.cast(backpointers, dtype=tf.int32) + return backpointers, new_state class CrfDecodeBackwardRnnCell(tf.keras.layers.Layer): - """Computes backward decoding in a linear-chain CRF. - """ + """Computes backward decoding in a linear-chain CRF.""" - def __init__(self, num_tags, **kwargs): - """Initialize the CrfDecodeBackwardRnnCell. + def __init__(self, num_tags, **kwargs): + """Initialize the CrfDecodeBackwardRnnCell. - Args: - num_tags: An integer. The number of tags. - """ - super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) - self._num_tags = num_tags + Args: + num_tags: An integer. The number of tags. 
+ """ + super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) + self._num_tags = num_tags - self.state_size = 1 - self.output_size = 1 + self.state_size = 1 + self.output_size = 1 - def build(self, input_shape): - super(CrfDecodeBackwardRnnCell, self).build(input_shape) + def build(self, input_shape): + super(CrfDecodeBackwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): - state = tf.squeeze(state[0], axis=[1]) - batch_size = tf.shape(inputs)[0] - b_indices = tf.range(batch_size) - indices = tf.stack([b_indices, state], axis=1) - new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1) + def call(self, inputs, state, training=None): + state = tf.squeeze(state[0], axis=[1]) + batch_size = tf.shape(inputs)[0] + b_indices = tf.range(batch_size) + indices = tf.stack([b_indices, state], axis=1) + new_tags = tf.expand_dims(tf.gather_nd(inputs, indices), axis=-1) - return new_tags, new_tags + return new_tags, new_tags def crf_decode(potentials, transition_params, sequence_length): - """Decode the highest scoring sequence of tags in TensorFlow. - - This is a function for tensor. - - Args: - potentials: A [batch_size, max_seq_len, num_tags] tensor of - unary potentials. - transition_params: A [num_tags, num_tags] matrix of - binary potentials. - sequence_length: A [batch_size] vector of true sequence lengths. - - Returns: - decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. - Contains the highest scoring tag indices. - best_score: A [batch_size] vector, containing the score of `decode_tags`. - """ - - # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag - # and the max activation. - def _single_seq_fn(): - squeezed_potentials = tf.squeeze(potentials, [1]) - decode_tags = tf.expand_dims( - tf.argmax(squeezed_potentials, axis=1), 1) - best_score = tf.reduce_max(squeezed_potentials, axis=1) - return tf.cast(decode_tags, dtype=tf.int32), best_score - - def _multi_seq_fn(): - """Decoding of highest scoring sequence.""" - - # For simplicity, in shape comments, denote: - # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output). - num_tags = potentials.shape[2] - - # Computes forward decoding. Get last score and backpointers. - initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1]) - initial_state = tf.squeeze(initial_state, axis=[1]) # [B, O] - inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] - # Sequence length is not allowed to be less than zero. 
- - sequence_length_less_one = tf.maximum( - tf.constant(0, dtype=sequence_length.dtype), - sequence_length - 1) - - crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params) - crf_fwd_layer = tf.keras.layers.RNN(crf_fwd_cell, - return_sequences=True, - return_state=True, - time_major=False) - backpointers, last_score = crf_fwd_layer(inputs, initial_state) - backpointers = tf.reverse_sequence(backpointers, sequence_length_less_one, seq_axis=1) - - crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags) - initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32) - initial_state = tf.expand_dims(initial_state, axis=-1) - crf_bwd_layer = tf.keras.layers.RNN(crf_bwd_cell, - return_sequences=True, - return_state=True, - time_major=False) - decode_tags, _ = crf_bwd_layer(backpointers, initial_state) - - decode_tags = tf.squeeze(decode_tags, axis=[2]) # [B, T - 1] - decode_tags = tf.concat([initial_state, decode_tags], # [B, T] - axis=1) - decode_tags = tf.reverse_sequence( # [B, T] - decode_tags, sequence_length, seq_axis=1) - - best_score = tf.reduce_max(last_score, axis=1) # [B] - return decode_tags, best_score - - if potentials.shape[1] == 1: - return _single_seq_fn() - else: - return _multi_seq_fn() + """Decode the highest scoring sequence of tags in TensorFlow. + + This is a function for tensor. + + Args: + potentials: A [batch_size, max_seq_len, num_tags] tensor of + unary potentials. + transition_params: A [num_tags, num_tags] matrix of + binary potentials. + sequence_length: A [batch_size] vector of true sequence lengths. + + Returns: + decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. + Contains the highest scoring tag indices. + best_score: A [batch_size] vector, containing the score of `decode_tags`. + """ + + # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag + # and the max activation. + def _single_seq_fn(): + squeezed_potentials = tf.squeeze(potentials, [1]) + decode_tags = tf.expand_dims(tf.argmax(squeezed_potentials, axis=1), 1) + best_score = tf.reduce_max(squeezed_potentials, axis=1) + return tf.cast(decode_tags, dtype=tf.int32), best_score + + def _multi_seq_fn(): + """Decoding of highest scoring sequence.""" + + # For simplicity, in shape comments, denote: + # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output). + num_tags = potentials.shape[2] + + # Computes forward decoding. Get last score and backpointers. + initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1]) + initial_state = tf.squeeze(initial_state, axis=[1]) # [B, O] + inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] + # Sequence length is not allowed to be less than zero. 
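# A hedged usage sketch for crf_decode (assumed toy shapes; `text` is
# assumed to be tensorflow_addons.text, as in the tests further below).
import tensorflow as tf
from tensorflow_addons import text

potentials = tf.random.normal([2, 5, 3])        # [batch, max_seq_len, num_tags]
transition = tf.random.normal([3, 3])           # [num_tags, num_tags]
seq_lens = tf.constant([5, 3], dtype=tf.int32)  # true lengths per example
tags, best_score = text.crf_decode(potentials, transition, seq_lens)
print(tags.shape, best_score.shape)             # (2, 5) (2,)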
+ + sequence_length_less_one = tf.maximum( + tf.constant(0, dtype=sequence_length.dtype), sequence_length - 1) + + mask = tf.sequence_mask(sequence_length_less_one, tf.shape(inputs)[1]) + crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params) + crf_fwd_layer = tf.keras.layers.RNN( + crf_fwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + backpointers, last_score = crf_fwd_layer( + inputs, initial_state, mask=mask) + backpointers = tf.reverse_sequence( + backpointers, sequence_length_less_one, seq_axis=1) + + crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags) + initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32) + initial_state = tf.expand_dims(initial_state, axis=-1) + crf_bwd_layer = tf.keras.layers.RNN( + crf_bwd_cell, + return_sequences=True, + return_state=True, + time_major=False) + decode_tags, _ = crf_bwd_layer(backpointers, initial_state) + + decode_tags = tf.squeeze(decode_tags, axis=[2]) # [B, T - 1] + decode_tags = tf.concat( + [initial_state, decode_tags], # [B, T] + axis=1) + decode_tags = tf.reverse_sequence( # [B, T] + decode_tags, sequence_length, seq_axis=1) + + best_score = tf.reduce_max(last_score, axis=1) # [B] + return decode_tags, best_score + + if potentials.shape[1] == 1: + return _single_seq_fn() + else: + return _multi_seq_fn() diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_ops_test.py index d706992c32..84c09b539b 100644 --- a/tensorflow_addons/text/crf_ops_test.py +++ b/tensorflow_addons/text/crf_ops_test.py @@ -27,332 +27,320 @@ from tensorflow_addons.utils import test_utils +@test_utils.run_all_in_graph_and_eager_modes class CrfTest(tf.test.TestCase): - - def calculateSequenceScore(self, inputs, transition_params, tag_indices, - sequence_lengths): - expected_unary_score = sum( - inputs[i][tag_indices[i]] for i in range(sequence_lengths)) - expected_binary_score = sum( - transition_params[tag_indices[i], tag_indices[i + 1]] - for i in range(sequence_lengths - 1)) - return expected_unary_score + expected_binary_score - - def testCrfSequenceScore(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. - sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[4, 5, -3]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([1], dtype=np.int32) - ] - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - - tf_sequence_score = self.evaluate(sequence_score) - - expected_sequence_score = self.calculateSequenceScore( - inputs, transition_params, tag_indices, sequence_lengths) - self.assertAllClose(tf_sequence_score, expected_sequence_score) - - def testCrfMultiTagSequenceScore(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. 
- sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[4, 5, -3]], - dtype=np.float32), - ] - tag_bitmap_list = [ - np.array( - [[True, True, False], [True, False, True], [False, True, True], - [True, False, True]], - dtype=np.bool), - np.array([[True, True, False]], dtype=np.bool) - ] - for sequence_lengths, inputs, tag_bitmap in zip( - sequence_lengths_list, inputs_list, tag_bitmap_list): - sequence_score = text.crf_multitag_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_bitmap=tf.expand_dims(tag_bitmap, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - tf_sum_sequence_score = self.evaluate(sequence_score) - all_indices_list = [ - single_index_bitmap.nonzero()[0] - for single_index_bitmap in tag_bitmap[:sequence_lengths] - ] - expected_sequence_scores = [ - self.calculateSequenceScore(inputs, transition_params, indices, - sequence_lengths) - for indices in itertools.product(*all_indices_list) - ] - expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( - expected_sequence_scores) - self.assertAllClose(tf_sum_sequence_score, - expected_log_sum_exp_sequence_scores) - - def testCrfUnaryScore(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - for dtype in (np.int32, np.int64): - tag_indices = np.array([1, 2, 1, 0], dtype=dtype) - sequence_lengths = np.array(3, dtype=np.int32) - unary_score = text.crf_unary_score( - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - inputs=tf.expand_dims(inputs, 0)) - unary_score = tf.squeeze(unary_score, [0]) - tf_unary_score = self.evaluate(unary_score) - expected_unary_score = sum(inputs[i][tag_indices[i]] - for i in range(sequence_lengths)) - self.assertAllClose(tf_unary_score, expected_unary_score) - - def testCrfBinaryScore(self): - tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - binary_score = text.crf_binary_score( - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - binary_score = tf.squeeze(binary_score, [0]) - tf_binary_score = self.evaluate(binary_score) - expected_binary_score = sum( - transition_params[tag_indices[i], tag_indices[i + 1]] - for i in range(sequence_lengths - 1)) - self.assertAllClose(tf_binary_score, expected_binary_score) - - def testCrfLogNorm(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. - sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int64) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[3, -1, 3]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([2], dtype=np.int32) - ] - - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - all_sequence_scores = [] - - # Compare the dynamic program with brute force computation. 
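# Hand computation (values taken from the tests above) of the unary-plus-
# binary sequence score that calculateSequenceScore mirrors.
import numpy as np

inputs = np.array([[4., 5., -3.], [3., -1., 3.], [-1., 2., 1.]])
trans = np.array([[-3., 5., -2.], [3., 4., 1.], [1., 2., 1.]])
tags = [1, 2, 1]
unary = sum(inputs[i][tags[i]] for i in range(3))  # 5 + 3 + 2 = 10
binary = trans[1, 2] + trans[2, 1]                 # 1 + 2 = 3
print(unary + binary)                              # 13.0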
- for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequence_scores.append( - text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), + def calculateSequenceScore(self, inputs, transition_params, tag_indices, + sequence_lengths): + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + return expected_unary_score + expected_binary_score + + def testCrfSequenceScore(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([1], dtype=np.int32) + ] + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + + tf_sequence_score = self.evaluate(sequence_score) + + expected_sequence_score = self.calculateSequenceScore( + inputs, transition_params, tag_indices, sequence_lengths) + self.assertAllClose(tf_sequence_score, expected_sequence_score) + + def testCrfMultiTagSequenceScore(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. 
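# Sketch of the brute-force reference used in these tests (assumed tiny
# sizes): enumerate every tag sequence, score it, and logsumexp the scores
# to obtain the partition function that crf_log_norm computes dynamically.
import itertools
import numpy as np

inputs = np.array([[4., 5., -3.], [3., -1., 3.], [-1., 2., 1.]])
trans = np.array([[-3., 5., -2.], [3., 4., 1.], [1., 2., 1.]])

def seq_score(tags):
    unary = sum(inputs[i][tags[i]] for i in range(len(tags)))
    binary = sum(trans[tags[i], tags[i + 1]] for i in range(len(tags) - 1))
    return unary + binary

scores = [seq_score(t) for t in itertools.product(range(3), repeat=3)]
print(np.logaddexp.reduce(scores))  # brute-force log normalizer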
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int32) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[4, 5, -3]], dtype=np.float32), + ] + tag_bitmap_list = [ + np.array([[True, True, False], [True, False, True], + [False, True, True], [True, False, True]], + dtype=np.bool), + np.array([[True, True, False]], dtype=np.bool) + ] + for sequence_lengths, inputs, tag_bitmap in zip( + sequence_lengths_list, inputs_list, tag_bitmap_list): + sequence_score = text.crf_multitag_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_bitmap=tf.expand_dims(tag_bitmap, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + tf_sum_sequence_score = self.evaluate(sequence_score) + all_indices_list = [ + single_index_bitmap.nonzero()[0] + for single_index_bitmap in tag_bitmap[:sequence_lengths] + ] + expected_sequence_scores = [ + self.calculateSequenceScore(inputs, transition_params, indices, + sequence_lengths) + for indices in itertools.product(*all_indices_list) + ] + expected_log_sum_exp_sequence_scores = np.logaddexp.reduce( + expected_sequence_scores) + self.assertAllClose(tf_sum_sequence_score, + expected_log_sum_exp_sequence_scores) + + def testCrfUnaryScore(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + for dtype in (np.int32, np.int64): + tag_indices = np.array([1, 2, 1, 0], dtype=dtype) + sequence_lengths = np.array(3, dtype=np.int32) + unary_score = text.crf_unary_score( + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + inputs=tf.expand_dims(inputs, 0)) + unary_score = tf.squeeze(unary_score, [0]) + tf_unary_score = self.evaluate(unary_score) + expected_unary_score = sum( + inputs[i][tag_indices[i]] for i in range(sequence_lengths)) + self.assertAllClose(tf_unary_score, expected_unary_score) + + def testCrfBinaryScore(self): + tag_indices = np.array([1, 2, 1, 0], dtype=np.int32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + binary_score = text.crf_binary_score( tag_indices=tf.expand_dims(tag_indices, 0), sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params))) - - brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) - log_norm = text.crf_log_norm( - inputs=tf.expand_dims(inputs, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - log_norm = tf.squeeze(log_norm, [0]) - tf_brute_force_log_norm, tf_log_norm = self.evaluate( - [brute_force_log_norm, log_norm]) - - self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) - - def testCrfLogNormZeroSeqLength(self): - """ - Test `crf_log_norm` when `sequence_lengths` contains one or more zeros. 
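# Sketch of the tag_bitmap semantics exercised by the multitag test above
# (assumed toy values): the bitmap lists the tags allowed at each step, and
# the score logsumexps over the cartesian product of allowed tags.
import itertools
import numpy as np

bitmap = np.array([[True, True, False],
                   [False, True, True]])  # [seq_len, num_tags]
allowed = [row.nonzero()[0] for row in bitmap]
print(list(itertools.product(*allowed)))  # [(0, 1), (0, 2), (1, 1), (1, 2)]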
- """ - inputs = tf.constant(np.ones([2, 10, 5], - dtype=np.float32)) - transition_params = tf.constant(np.ones([5, 5], - dtype=np.float32)) - sequence_lengths = tf.constant(np.zeros([2], - dtype=np.int32)) - expected_log_norm = np.zeros([2], dtype=np.float32) - log_norm = text.crf_log_norm(inputs, sequence_lengths, transition_params) - tf_log_norm = self.evaluate(log_norm) - self.assertAllClose(tf_log_norm, expected_log_norm) - - def testCrfLogLikelihood(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - all_sequence_log_likelihoods = [] - - # Make sure all probabilities sum to 1. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - sequence_log_likelihood, _ = text.crf_log_likelihood( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - all_sequence_log_likelihoods.append(sequence_log_likelihood) - total_log_likelihood = tf.reduce_logsumexp( - all_sequence_log_likelihoods) - tf_total_log_likelihood = self.evaluate(total_log_likelihood) - self.assertAllClose(tf_total_log_likelihood, 0.0) - - def testViterbiDecode(self): - inputs = np.array( - [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - sequence_lengths = np.array(3, dtype=np.int32) - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - - all_sequence_scores = [] - all_sequences = [] - - # Compare the dynamic program with brute force computation. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequences.append(tag_indices) - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - all_sequence_scores.append(sequence_score) - - tf_all_sequence_scores = self.evaluate(all_sequence_scores) - - expected_max_sequence_index = np.argmax(tf_all_sequence_scores) - expected_max_sequence = all_sequences[expected_max_sequence_index] - expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] - - actual_max_sequence, actual_max_score = text.viterbi_decode( - inputs[:sequence_lengths], transition_params) - - self.assertAllClose(actual_max_score, expected_max_score) - self.assertEqual(actual_max_sequence, - expected_max_sequence[:sequence_lengths]) - - def testCrfDecode(self): - transition_params = np.array( - [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32) - # Test both the length-1 and regular cases. 
- sequence_lengths_list = [ - np.array(3, dtype=np.int32), - np.array(1, dtype=np.int64) - ] - inputs_list = [ - np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], - dtype=np.float32), - np.array([[-1, 2, 1]], - dtype=np.float32), - ] - tag_indices_list = [ - np.array([1, 2, 1, 0], dtype=np.int32), - np.array([2], dtype=np.int32) - ] - - for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list, - inputs_list, - tag_indices_list): - num_words = inputs.shape[0] - num_tags = inputs.shape[1] - - all_sequence_scores = [] - all_sequences = [] - - # Compare the dynamic program with brute force computation. - for tag_indices in itertools.product( - range(num_tags), repeat=sequence_lengths): - tag_indices = list(tag_indices) - tag_indices.extend([0] * (num_words - sequence_lengths)) - all_sequences.append(tag_indices) - sequence_score = text.crf_sequence_score( - inputs=tf.expand_dims(inputs, 0), - tag_indices=tf.expand_dims(tag_indices, 0), - sequence_lengths=tf.expand_dims(sequence_lengths, 0), - transition_params=tf.constant(transition_params)) - sequence_score = tf.squeeze(sequence_score, [0]) - all_sequence_scores.append(sequence_score) - - tf_all_sequence_scores = self.evaluate(all_sequence_scores) - - expected_max_sequence_index = np.argmax(tf_all_sequence_scores) - expected_max_sequence = all_sequences[expected_max_sequence_index] - expected_max_score = tf_all_sequence_scores[expected_max_sequence_index] - - actual_max_sequence, actual_max_score = text.crf_decode( - tf.expand_dims(inputs, 0), - tf.constant(transition_params), - tf.expand_dims(sequence_lengths, 0)) - actual_max_sequence = tf.squeeze(actual_max_sequence, [0]) - actual_max_score = tf.squeeze(actual_max_score, [0]) - tf_actual_max_sequence, tf_actual_max_score = self.evaluate( - [actual_max_sequence, actual_max_score]) - - self.assertAllClose(tf_actual_max_score, expected_max_score) - self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]), - expected_max_sequence[:sequence_lengths]) - - def testCrfDecodeZeroSeqLength(self): - """ - Test that crf_decode works when sequence_length contains one or more zeros. - """ - inputs = tf.constant(np.ones([2, 10, 5], - dtype=np.float32)) - transition_params = tf.constant(np.ones([5, 5], - dtype=np.float32)) - sequence_lengths = tf.constant(np.zeros([2], - dtype=np.int32)) - tags, scores = text.crf_decode(inputs, transition_params, sequence_lengths) - tf_tags, tf_scores = self.evaluate([tags, scores]) - self.assertEqual(len(tf_tags.shape), 2) - self.assertEqual(len(tf_scores.shape), 1) + transition_params=tf.constant(transition_params)) + binary_score = tf.squeeze(binary_score, [0]) + tf_binary_score = self.evaluate(binary_score) + expected_binary_score = sum( + transition_params[tag_indices[i], tag_indices[i + 1]] + for i in range(sequence_lengths - 1)) + self.assertAllClose(tf_binary_score, expected_binary_score) + + def testCrfLogNorm(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. 
+ sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[3, -1, 3]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_scores = [] + + # Compare the dynamic program with brute force computation. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequence_scores.append( + text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params))) + + brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores) + log_norm = text.crf_log_norm( + inputs=tf.expand_dims(inputs, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + log_norm = tf.squeeze(log_norm, [0]) + tf_brute_force_log_norm, tf_log_norm = self.evaluate( + [brute_force_log_norm, log_norm]) + + self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) + + def testCrfLogNormZeroSeqLength(self): + """Test `crf_log_norm` when `sequence_lengths` contains one or more + zeros.""" + inputs = tf.constant(np.ones([2, 10, 5], dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=np.float32) + log_norm = text.crf_log_norm(inputs, sequence_lengths, + transition_params) + tf_log_norm = self.evaluate(log_norm) + self.assertAllClose(tf_log_norm, expected_log_norm) + + def testCrfLogLikelihood(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + all_sequence_log_likelihoods = [] + + # Make sure all probabilities sum to 1. + for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + sequence_log_likelihood, _ = text.crf_log_likelihood( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + all_sequence_log_likelihoods.append(sequence_log_likelihood) + total_log_likelihood = tf.reduce_logsumexp( + all_sequence_log_likelihoods) + tf_total_log_likelihood = self.evaluate(total_log_likelihood) + self.assertAllClose(tf_total_log_likelihood, 0.0) + + def testViterbiDecode(self): + inputs = np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32) + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + sequence_lengths = np.array(3, dtype=np.int32) + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
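# Sketch (assumed 2-step toy problem) of the brute-force argmax this test
# performs next; the best enumerated sequence must match viterbi_decode.
import itertools
import numpy as np

inputs = np.array([[4., 5., -3.], [3., -1., 3.]])
trans = np.array([[-3., 5., -2.], [3., 4., 1.], [1., 2., 1.]])
best = max(itertools.product(range(3), repeat=2),
           key=lambda t: inputs[0][t[0]] + inputs[1][t[1]] + trans[t[0], t[1]])
print(best)  # -> (1, 0)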
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[ + expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.viterbi_decode( + inputs[:sequence_lengths], transition_params) + + self.assertAllClose(actual_max_score, expected_max_score) + self.assertEqual(actual_max_sequence, + expected_max_sequence[:sequence_lengths]) + + def testCrfDecode(self): + transition_params = np.array([[-3, 5, -2], [3, 4, 1], [1, 2, 1]], + dtype=np.float32) + # Test both the length-1 and regular cases. + sequence_lengths_list = [ + np.array(3, dtype=np.int32), + np.array(1, dtype=np.int64) + ] + inputs_list = [ + np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], + dtype=np.float32), + np.array([[-1, 2, 1]], dtype=np.float32), + ] + tag_indices_list = [ + np.array([1, 2, 1, 0], dtype=np.int32), + np.array([2], dtype=np.int32) + ] + + for sequence_lengths, inputs, tag_indices in zip( + sequence_lengths_list, inputs_list, tag_indices_list): + num_words = inputs.shape[0] + num_tags = inputs.shape[1] + + all_sequence_scores = [] + all_sequences = [] + + # Compare the dynamic program with brute force computation. 
+ for tag_indices in itertools.product( + range(num_tags), repeat=sequence_lengths): + tag_indices = list(tag_indices) + tag_indices.extend([0] * (num_words - sequence_lengths)) + all_sequences.append(tag_indices) + sequence_score = text.crf_sequence_score( + inputs=tf.expand_dims(inputs, 0), + tag_indices=tf.expand_dims(tag_indices, 0), + sequence_lengths=tf.expand_dims(sequence_lengths, 0), + transition_params=tf.constant(transition_params)) + sequence_score = tf.squeeze(sequence_score, [0]) + all_sequence_scores.append(sequence_score) + + tf_all_sequence_scores = self.evaluate(all_sequence_scores) + + expected_max_sequence_index = np.argmax(tf_all_sequence_scores) + expected_max_sequence = all_sequences[expected_max_sequence_index] + expected_max_score = tf_all_sequence_scores[ + expected_max_sequence_index] + + actual_max_sequence, actual_max_score = text.crf_decode( + tf.expand_dims(inputs, 0), tf.constant(transition_params), + tf.expand_dims(sequence_lengths, 0)) + actual_max_sequence = tf.squeeze(actual_max_sequence, [0]) + actual_max_score = tf.squeeze(actual_max_score, [0]) + tf_actual_max_sequence, tf_actual_max_score = self.evaluate( + [actual_max_sequence, actual_max_score]) + + self.assertAllClose(tf_actual_max_score, expected_max_score) + self.assertEqual( + list(tf_actual_max_sequence[:sequence_lengths]), + expected_max_sequence[:sequence_lengths]) + + def testCrfDecodeZeroSeqLength(self): + """Test that crf_decode works when sequence_length contains one or more + zeros.""" + inputs = tf.constant(np.ones([2, 10, 5], dtype=np.float32)) + transition_params = tf.constant(np.ones([5, 5], dtype=np.float32)) + sequence_lengths = tf.constant(np.zeros([2], dtype=np.int32)) + tags, scores = text.crf_decode(inputs, transition_params, + sequence_lengths) + tf_tags, tf_scores = self.evaluate([tags, scores]) + self.assertEqual(len(tf_tags.shape), 2) + self.assertEqual(len(tf_scores.shape), 1) if __name__ == "__main__": - tf.test.main() + tf.test.main() From a90b478721fbbe65ad038524ed0bc26556d01179 Mon Sep 17 00:00:00 2001 From: "Dheeraj R. 
Reddy" Date: Tue, 25 Jun 2019 11:44:45 +0530 Subject: [PATCH 07/52] Rename files and minor fixes * Rename crf_ops* -> crf* * The RNN cells inherit `AbstractRNNCell` instead of `Layer` * Remove used `training` variable * Add docstring for RNN Cells --- tensorflow_addons/text/BUILD | 8 +-- tensorflow_addons/text/__init__.py | 27 +++++---- tensorflow_addons/text/{crf_ops.py => crf.py} | 60 +++++++++++++++---- .../text/{crf_ops_test.py => crf_test.py} | 0 4 files changed, 67 insertions(+), 28 deletions(-) rename tensorflow_addons/text/{crf_ops.py => crf.py} (92%) rename tensorflow_addons/text/{crf_ops_test.py => crf_test.py} (100%) diff --git a/tensorflow_addons/text/BUILD b/tensorflow_addons/text/BUILD index d96bdd582b..21306ef3f9 100644 --- a/tensorflow_addons/text/BUILD +++ b/tensorflow_addons/text/BUILD @@ -6,7 +6,7 @@ py_library( name = "text", srcs = ([ "__init__.py", - "crf_ops.py", + "crf.py", "skip_gram_ops.py", ]), data = [ @@ -17,12 +17,12 @@ py_library( ) py_test( - name = "crf_ops_test", + name = "crf_test", size = "small", srcs = [ - "crf_ops_test.py", + "crf_test.py", ], - main = "crf_ops_test.py", + main = "crf_test.py", srcs_version = "PY2AND3", deps = [ ":text", diff --git a/tensorflow_addons/text/__init__.py b/tensorflow_addons/text/__init__.py index 6c67afa387..865b72725a 100644 --- a/tensorflow_addons/text/__init__.py +++ b/tensorflow_addons/text/__init__.py @@ -17,18 +17,19 @@ from __future__ import division from __future__ import print_function +# Conditional Random Field +from tensorflow_addons.text.crf import crf_binary_score +from tensorflow_addons.text.crf import crf_decode +from tensorflow_addons.text.crf import crf_log_likelihood +from tensorflow_addons.text.crf import crf_log_norm +from tensorflow_addons.text.crf import crf_multitag_sequence_score +from tensorflow_addons.text.crf import crf_sequence_score +from tensorflow_addons.text.crf import crf_unary_score +from tensorflow_addons.text.crf import CrfDecodeBackwardRnnCell +from tensorflow_addons.text.crf import CrfDecodeForwardRnnCell +from tensorflow_addons.text.crf import CrfForwardRnnCell +from tensorflow_addons.text.crf import viterbi_decode + # Skip Gram Sampling from tensorflow_addons.text.skip_gram_ops import skip_gram_sample -from tensorflow_addons.text.skip_gram_ops import skip_gram_sample_with_text_vocab - -from tensorflow_addons.text.crf_ops import crf_binary_score -from tensorflow_addons.text.crf_ops import crf_decode -from tensorflow_addons.text.crf_ops import crf_log_likelihood -from tensorflow_addons.text.crf_ops import crf_log_norm -from tensorflow_addons.text.crf_ops import crf_multitag_sequence_score -from tensorflow_addons.text.crf_ops import crf_sequence_score -from tensorflow_addons.text.crf_ops import crf_unary_score -from tensorflow_addons.text.crf_ops import CrfDecodeBackwardRnnCell -from tensorflow_addons.text.crf_ops import CrfDecodeForwardRnnCell -from tensorflow_addons.text.crf_ops import CrfForwardRnnCell -from tensorflow_addons.text.crf_ops import viterbi_decode +from tensorflow_addons.text.skip_gram_ops import skip_gram_sample_with_text_vocab \ No newline at end of file diff --git a/tensorflow_addons/text/crf_ops.py b/tensorflow_addons/text/crf.py similarity index 92% rename from tensorflow_addons/text/crf_ops.py rename to tensorflow_addons/text/crf.py index d91cceaebc..8d5e3ceef7 100644 --- a/tensorflow_addons/text/crf_ops.py +++ b/tensorflow_addons/text/crf.py @@ -280,18 +280,45 @@ def crf_binary_score(tag_indices, sequence_lengths, transition_params): return binary_scores -class 
CrfForwardRnnCell(tf.keras.layers.Layer): +class CrfForwardRnnCell(tf.keras.layers.AbstractRNNCell): + """Computes the alpha values in a linear-chain CRF. + + See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference. + """ def __init__(self, transition_params, **kwargs): + """Initialize the CrfForwardRnnCell. + Args: + transition_params: A [num_tags, num_tags] matrix of binary + potentials. This matrix is expanded into a + [1, num_tags, num_tags] in preparation for the + broadcast summation occurring within the cell. + """ super(CrfForwardRnnCell, self).__init__(**kwargs) self._transition_params = tf.expand_dims(transition_params, 0) self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + + @property + def state_size(self): + return self._num_tags + + @property + def output_size(self): + return self._num_tags def build(self, input_shape): super(CrfForwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): + def call(self, inputs, state): + """Build the CrfForwardRnnCell. + Args: + inputs: A [batch_size, num_tags] matrix of unary potentials. + state: A [batch_size, num_tags] matrix containing the + previous alpha values. + scope: Unused variable scope of this cell. + Returns: + new_alphas, new_alphas: A pair of [batch_size, num_tags] + matrices values containing the new alpha values. + """ state = tf.expand_dims(state[0], 2) transition_scores = state + self._transition_params new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1]) @@ -330,7 +357,7 @@ def viterbi_decode(score, transition_params): return viterbi, viterbi_score -class CrfDecodeForwardRnnCell(tf.keras.layers.Layer): +class CrfDecodeForwardRnnCell(tf.keras.layers.AbstractRNNCell): """Computes the forward decoding in a linear-chain CRF.""" def __init__(self, transition_params, **kwargs): @@ -345,13 +372,19 @@ def __init__(self, transition_params, **kwargs): super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) self._transition_params = tf.expand_dims(transition_params, 0) self._num_tags = transition_params.shape[0] - self.state_size = self._num_tags - self.output_size = self._num_tags + + @property + def state_size(self): + return self._num_tags + + @property + def output_size(self): + return self._num_tags def build(self, input_shape): super(CrfDecodeForwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): + def call(self, inputs, state): state = tf.expand_dims(state[0], 2) transition_scores = state + self._transition_params new_state = inputs + tf.reduce_max(transition_scores, [1]) @@ -372,13 +405,18 @@ def __init__(self, num_tags, **kwargs): super(CrfDecodeBackwardRnnCell, self).__init__(**kwargs) self._num_tags = num_tags - self.state_size = 1 - self.output_size = 1 + @property + def state_size(self): + return 1 + + @property + def output_size(self): + return 1 def build(self, input_shape): super(CrfDecodeBackwardRnnCell, self).build(input_shape) - def call(self, inputs, state, training=None): + def call(self, inputs, state): state = tf.squeeze(state[0], axis=[1]) batch_size = tf.shape(inputs)[0] b_indices = tf.range(batch_size) diff --git a/tensorflow_addons/text/crf_ops_test.py b/tensorflow_addons/text/crf_test.py similarity index 100% rename from tensorflow_addons/text/crf_ops_test.py rename to tensorflow_addons/text/crf_test.py From d25b747611689d7258c30487fa8ca1acda43e07a Mon Sep 17 00:00:00 2001 From: "Dheeraj R. 
Reddy" Date: Tue, 25 Jun 2019 11:49:56 +0530 Subject: [PATCH 08/52] code format --- tensorflow_addons/text/crf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/text/crf.py b/tensorflow_addons/text/crf.py index 8d5e3ceef7..8965305d42 100644 --- a/tensorflow_addons/text/crf.py +++ b/tensorflow_addons/text/crf.py @@ -285,6 +285,7 @@ class CrfForwardRnnCell(tf.keras.layers.AbstractRNNCell): See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference. """ + def __init__(self, transition_params, **kwargs): """Initialize the CrfForwardRnnCell. Args: @@ -310,13 +311,14 @@ def build(self, input_shape): def call(self, inputs, state): """Build the CrfForwardRnnCell. + Args: inputs: A [batch_size, num_tags] matrix of unary potentials. - state: A [batch_size, num_tags] matrix containing the + state: A [batch_size, num_tags] matrix containing the previous alpha values. scope: Unused variable scope of this cell. Returns: - new_alphas, new_alphas: A pair of [batch_size, num_tags] + new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices values containing the new alpha values. """ state = tf.expand_dims(state[0], 2) @@ -372,7 +374,7 @@ def __init__(self, transition_params, **kwargs): super(CrfDecodeForwardRnnCell, self).__init__(**kwargs) self._transition_params = tf.expand_dims(transition_params, 0) self._num_tags = transition_params.shape[0] - + @property def state_size(self): return self._num_tags From 34811e64ac4ddf12b0d308bca841e1982beb5900 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 25 Jun 2019 18:45:14 +0800 Subject: [PATCH 09/52] save work progress --- tensorflow_addons/layers/crf.py | 47 ++ tensorflow_addons/layers/old_crf.py | 573 ++++++++++++++++++++++ tensorflow_addons/losses/crf_losses.py | 53 ++ tensorflow_addons/metrics/crf_accuracy.py | 0 tensorflow_addons/metrics/marginal_acc.py | 0 tensorflow_addons/metrics/viterbi_acc.py | 0 6 files changed, 673 insertions(+) create mode 100644 tensorflow_addons/layers/crf.py create mode 100644 tensorflow_addons/layers/old_crf.py create mode 100644 tensorflow_addons/losses/crf_losses.py create mode 100644 tensorflow_addons/metrics/crf_accuracy.py create mode 100644 tensorflow_addons/metrics/marginal_acc.py create mode 100644 tensorflow_addons/metrics/viterbi_acc.py diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py new file mode 100644 index 0000000000..7c0dd99c5b --- /dev/null +++ b/tensorflow_addons/layers/crf.py @@ -0,0 +1,47 @@ +import tensorflow as tf + +from tensorflow_addons.text.crf import crf_decode + + +class CRF(tf.keras.layers.Layer): + def __init__(self, units): + super(CRF, self).__init__() + self.units = units # numbers of tags + + def build(self, input_shape): + self.input_dim = input_shape[-1] + + self.kernel = self.add_weight(shape=(self.input_dim, self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + + self.chain_kernel = self.add_weight(shape=(self.units, self.units), + name='chain_kernel', + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint) + + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = 0 + + def call(self, input, **kwargs): + logits = self._dense_layer(input) + pred_ids, _ = crf_decode(logits, self.chain_kernel, 
nwords)
+        return pred_ids
+
+    def _dense_layer(self, input):
+        # TODO: could this simply use tf.keras.layers.Dense?
+        return self.activation(tf.matmul(input, self.kernel) + self.bias)
+
+
+if __name__ == "__main__":
+    layer = CRF(10)
+    print(layer(tf.zeros([2, 10, 5])))  # [batch, timesteps, input_dim]
+    print(layer.trainable_variables)
diff --git a/tensorflow_addons/layers/old_crf.py b/tensorflow_addons/layers/old_crf.py
new file mode 100644
index 0000000000..180f97c132
--- /dev/null
+++ b/tensorflow_addons/layers/old_crf.py
@@ -0,0 +1,573 @@
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+from keras import backend as K
+from keras import activations
+from keras import initializers
+from keras import regularizers
+from keras import constraints
+from keras.layers import Layer
+from keras.layers import InputSpec
+
+from keras_contrib.losses import crf_loss
+from keras_contrib.metrics import crf_marginal_accuracy
+from keras_contrib.metrics import crf_viterbi_accuracy
+from keras_contrib.utils.test_utils import to_tuple
+
+
+class CRF(Layer):
+    """An implementation of a linear-chain conditional random field (CRF).
+    A linear-chain CRF is defined to maximize the following likelihood function:
+    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
+    \exp(-a_1' y_1 - a_n' y_n
+        - \sum_{k=1}^{n} (f(x_k' W + b) y_k)
+        - \sum_{k=1}^{n-1} y_k' U y_{k+1}), $$
+    where:
+        $Z$: normalization constant
+        $x_k, y_k$: inputs and outputs
+    This implementation has two modes for optimization:
+    1. (`join mode`) optimized by maximizing the joint likelihood,
+    which is statistically optimal.
+        Note that in this case, CRF must be the output/last layer.
+    2. (`marginal mode`) returns marginal probabilities at each time
+    step and is optimized via the composite
+    likelihood (product of marginal likelihoods), i.e.,
+    using `categorical_crossentropy` loss.
+        Note that in this case, CRF can be either the last layer or an
+        intermediate layer (though not explored).
+    For prediction (test phase), one can choose either the Viterbi
+    best path (class indices) or marginal
+    probabilities if probabilities are needed.
+    However, if one chooses *join mode* for training,
+    Viterbi output is typically better than marginal output,
+    but the marginal output will still perform
+    reasonably well, while if *marginal mode* is used for training,
+    marginal output usually performs
+    much better. The default behavior and `metrics.crf_accuracy`
+    are set according to this observation.
+    In addition, this implementation supports masking and accepts either
+    one-hot or sparse targets.
+    If you open an issue or a pull request about CRF, please
+    add 'cc @lzfelix' to notify Luiz Felix.
+    # Examples
+    ```python
+    from keras_contrib.layers import CRF
+    from keras_contrib.losses import crf_loss
+    from keras_contrib.metrics import crf_viterbi_accuracy
+    model = Sequential()
+    model.add(Embedding(3001, 300, mask_zero=True))
+    # use learn_mode = 'join', test_mode = 'viterbi',
+    # sparse_target = True (label index output)
+    crf = CRF(10, sparse_target=True)
+    model.add(crf)
+    # crf_accuracy defaults to Viterbi acc if using join-mode (default).
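# A hedged sketch of how the `nwords` sequence lengths referenced by the
# work-in-progress CRF.call above could be obtained; deriving them from a
# boolean padding mask is an assumption, not part of this patch.
import tensorflow as tf

mask = tf.constant([[True, True, True, False],
                    [True, True, False, False]])
nwords = tf.reduce_sum(tf.cast(mask, tf.int32), axis=-1)
print(nwords)  # -> [3 2]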
+    # One can add crf.marginal_acc if interested, but it may slow down learning
+    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
+    # y must be label indices (with shape 1 at dim 3) here,
+    # since `sparse_target=True`
+    model.fit(x, y)
+    # prediction gives the one-hot representation of the Viterbi best path
+    y_hat = model.predict(x_test)
+    ```
+    The following snippet shows how to load a persisted
+    model that uses the CRF layer:
+    ```python
+    from keras.models import load_model
+    from keras_contrib.losses import crf_loss
+    from keras_contrib.metrics import crf_viterbi_accuracy
+    custom_objects={'CRF': CRF,
+                    'crf_loss': crf_loss,
+                    'crf_viterbi_accuracy': crf_viterbi_accuracy}
+    loaded_model = load_model('',
+                              custom_objects=custom_objects)
+    ```
+    # Arguments
+        units: Positive integer, dimensionality of the output space.
+        learn_mode: Either 'join' or 'marginal'.
+            The former trains the model by maximizing the joint likelihood
+            while the latter maximizes the product of marginal likelihoods
+            over all time steps.
+            One should use `losses.crf_nll` for 'join' mode
+            and `losses.categorical_crossentropy` or
+            `losses.sparse_categorical_crossentropy` for
+            `marginal` mode. For convenience, simply
+            use `losses.crf_loss`, which will decide the proper loss as described.
+        test_mode: Either 'viterbi' or 'marginal'.
+            The former is recommended and is the default when `learn_mode = 'join'` and
+            gives a one-hot representation of the best path at test (prediction) time,
+            while the latter is recommended and chosen as default
+            when `learn_mode = 'marginal'`,
+            which produces marginal probabilities for each time step.
+            For evaluating metrics, one should
+            use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and
+            'metrics.crf_marginal_accuracy' for 'marginal' mode, or
+            simply use `metrics.crf_accuracy` for
+            both, which automatically decides between them as described.
+            One can also use both for evaluation at training.
+        sparse_target: Boolean (default False) indicating
+            if provided labels are one-hot or
+            indices (with shape 1 at dim 3).
+        use_boundary: Boolean (default True) indicating if trainable
+            start-end chain energies
+            should be added to the model.
+        use_bias: Boolean, whether the layer uses a bias vector.
+        kernel_initializer: Initializer for the `kernel` weights matrix,
+            used for the linear transformation of the inputs.
+            (see [initializers](../initializers.md)).
+        chain_initializer: Initializer for the `chain_kernel` weights matrix,
+            used for the CRF chain energy.
+            (see [initializers](../initializers.md)).
+        boundary_initializer: Initializer for the `left_boundary`,
+            'right_boundary' weights vectors,
+            used for the start/left and end/right boundary energy.
+            (see [initializers](../initializers.md)).
+        bias_initializer: Initializer for the bias vector
+            (see [initializers](../initializers.md)).
+        activation: Activation function to use
+            (see [activations](../activations.md)).
+            If you pass None, no activation is applied
+            (ie. "linear" activation: `a(x) = x`).
+        kernel_regularizer: Regularizer function applied to
+            the `kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        chain_regularizer: Regularizer function applied to
+            the `chain_kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        boundary_regularizer: Regularizer function applied to
+            the 'left_boundary', 'right_boundary' weight vectors
+            (see [regularizer](../regularizers.md)).
+        bias_regularizer: Regularizer function applied to the bias vector
+            (see [regularizer](../regularizers.md)).
+ kernel_constraint: Constraint function applied to + the `kernel` weights matrix + (see [constraints](../constraints.md)). + chain_constraint: Constraint function applied to + the `chain_kernel` weights matrix + (see [constraints](../constraints.md)). + boundary_constraint: Constraint function applied to + the `left_boundary`, `right_boundary` weights vectors + (see [constraints](../constraints.md)). + bias_constraint: Constraint function applied to the bias vector + (see [constraints](../constraints.md)). + input_dim: dimensionality of the input (integer). + This argument (or alternatively, the keyword argument `input_shape`) + is required when using this layer as the first layer in a model. + unroll: Boolean (default False). If True, the network will be + unrolled, else a symbolic loop will be used. + Unrolling can speed-up a RNN, although it tends + to be more memory-intensive. + Unrolling is only suitable for short sequences. + # Input shape + 3D tensor with shape `(nb_samples, timesteps, input_dim)`. + # Output shape + 3D tensor with shape `(nb_samples, timesteps, units)`. + # Masking + This layer supports masking for input data with a variable number + of timesteps. To introduce masks to your data, + use an [Embedding](embeddings.md) layer with the `mask_zero` parameter + set to `True`. + """ + + def __init__(self, units, + learn_mode='join', + test_mode=None, + sparse_target=False, + use_boundary=True, + use_bias=True, + activation='linear', + kernel_initializer='glorot_uniform', + chain_initializer='orthogonal', + bias_initializer='zeros', + boundary_initializer='zeros', + kernel_regularizer=None, + chain_regularizer=None, + boundary_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + chain_constraint=None, + boundary_constraint=None, + bias_constraint=None, + input_dim=None, + unroll=False, + **kwargs): + super(CRF, self).__init__(**kwargs) + self.supports_masking = True + self.units = units + self.learn_mode = learn_mode + assert self.learn_mode in ['join', 'marginal'] + self.test_mode = test_mode + if self.test_mode is None: + self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' + else: + assert self.test_mode in ['viterbi', 'marginal'] + self.sparse_target = sparse_target + self.use_boundary = use_boundary + self.use_bias = use_bias + + self.activation = activations.get(activation) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.chain_initializer = initializers.get(chain_initializer) + self.boundary_initializer = initializers.get(boundary_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.chain_regularizer = regularizers.get(chain_regularizer) + self.boundary_regularizer = regularizers.get(boundary_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.chain_constraint = constraints.get(chain_constraint) + self.boundary_constraint = constraints.get(boundary_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.unroll = unroll + + def build(self, input_shape): + input_shape = to_tuple(input_shape) + self.input_spec = [InputSpec(shape=input_shape)] + self.input_dim = input_shape[-1] + + self.kernel = self.add_weight(shape=(self.input_dim, self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.chain_kernel = 
self.add_weight(shape=(self.units, self.units), + name='chain_kernel', + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint) + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = 0 + + if self.use_boundary: + self.left_boundary = self.add_weight(shape=(self.units,), + name='left_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.right_boundary = self.add_weight(shape=(self.units,), + name='right_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.built = True + + def call(self, X, mask=None): + if mask is not None: + assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' + + if self.test_mode == 'viterbi': + test_output = self.viterbi_decoding(X, mask) + else: + test_output = self.get_marginal_prob(X, mask) + + self.uses_learning_phase = True + if self.learn_mode == 'join': + train_output = K.zeros_like(K.dot(X, self.kernel)) + out = K.in_train_phase(train_output, test_output) + else: + if self.test_mode == 'viterbi': + train_output = self.get_marginal_prob(X, mask) + out = K.in_train_phase(train_output, test_output) + else: + out = test_output + return out + + def compute_output_shape(self, input_shape): + return input_shape[:2] + (self.units,) + + def compute_mask(self, input, mask=None): + if mask is not None and self.learn_mode == 'join': + return K.any(mask, axis=1) + return mask + + def get_config(self): + config = { + 'units': self.units, + 'learn_mode': self.learn_mode, + 'test_mode': self.test_mode, + 'use_boundary': self.use_boundary, + 'use_bias': self.use_bias, + 'sparse_target': self.sparse_target, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), + 'chain_initializer': initializers.serialize(self.chain_initializer), + 'boundary_initializer': initializers.serialize( + self.boundary_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'activation': activations.serialize(self.activation), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), + 'chain_regularizer': regularizers.serialize(self.chain_regularizer), + 'boundary_regularizer': regularizers.serialize( + self.boundary_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'chain_constraint': constraints.serialize(self.chain_constraint), + 'boundary_constraint': constraints.serialize(self.boundary_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint), + 'input_dim': self.input_dim, + 'unroll': self.unroll} + base_config = super(CRF, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @property + def loss_function(self): + warnings.warn('CRF.loss_function is deprecated ' + 'and it might be removed in the future. Please ' + 'use losses.crf_loss instead.') + return crf_loss + + @property + def accuracy(self): + warnings.warn('CRF.accuracy is deprecated and it ' + 'might be removed in the future. 
Please '
+                      'use metrics.crf_accuracy instead.')
+        if self.test_mode == 'viterbi':
+            return crf_viterbi_accuracy
+        else:
+            return crf_marginal_accuracy
+
+    @property
+    def viterbi_acc(self):
+        warnings.warn('CRF.viterbi_acc is deprecated and it might '
+                      'be removed in the future. Please '
+                      'use metrics.viterbi_acc instead.')
+        return crf_viterbi_accuracy
+
+    @property
+    def marginal_acc(self):
+        warnings.warn('CRF.marginal_acc is deprecated and it '
+                      'might be removed in the future. Please '
+                      'use metrics.marginal_acc instead.')
+        return crf_marginal_accuracy
+
+    @staticmethod
+    def softmaxNd(x, axis=-1):
+        m = K.max(x, axis=axis, keepdims=True)
+        exp_x = K.exp(x - m)
+        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
+        return prob_x
+
+    @staticmethod
+    def shift_left(x, offset=1):
+        assert offset > 0
+        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)
+
+    @staticmethod
+    def shift_right(x, offset=1):
+        assert offset > 0
+        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)
+
+    def add_boundary_energy(self, energy, mask, start, end):
+        start = K.expand_dims(K.expand_dims(start, 0), 0)
+        end = K.expand_dims(K.expand_dims(end, 0), 0)
+        if mask is None:
+            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]],
+                                   axis=1)
+            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end],
+                                   axis=1)
+        else:
+            mask = K.expand_dims(K.cast(mask, K.floatx()))
+            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
+            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
+            energy = energy + start_mask * start
+            energy = energy + end_mask * end
+        return energy
+
+    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
+        """Compute logarithm of the normalization constant Z, where
+        Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ
+        """
+        # should have logZ[:, i] == logZ[:, j] for any i, j
+        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
+        return logZ[:, 0]
+
+    def get_energy(self, y_true, input_energy, mask):
+        """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3
+        """
+        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
+        # (B, T-1)
+        chain_energy = K.sum(K.dot(y_true[:, :-1, :],
+                                   self.chain_kernel) * y_true[:, 1:, :], 2)
+
+        if mask is not None:
+            mask = K.cast(mask, K.floatx())
+            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
+            chain_mask = mask[:, :-1] * mask[:, 1:]
+            input_energy = input_energy * mask
+            chain_energy = chain_energy * chain_mask
+        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )
+
+        return total_energy
+
+    def get_negative_log_likelihood(self, y_true, X, mask):
+        """Compute the loss, i.e., the negative log likelihood (normalized by the
+        number of time steps)
+        likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E
+        """
+        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
+        if self.use_boundary:
+            input_energy = self.add_boundary_energy(input_energy, mask,
+                                                    self.left_boundary,
+                                                    self.right_boundary)
+        energy = self.get_energy(y_true, input_energy, mask)
+        logZ = self.get_log_normalization_constant(input_energy, mask,
+                                                   input_length=K.int_shape(X)[1])
+        nloglik = logZ + energy
+        if mask is not None:
+            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
+        else:
+            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
+        return nloglik
+
+    def step(self, input_energy_t, states, return_logZ=True):
+        # Note: in the following, `prev_target_val` has shape = (B, F)
+        # where B = batch_size, F = output feature dim
+        # Note: `i` is of float32, due to the behavior of `K.rnn`
+        prev_target_val, i, chain_energy = states[:3]
+        t = K.cast(i[0, 0], dtype='int32')
+        if len(states) > 3:
+            if K.backend() == 'theano':
+                m = states[3][:, t:(t + 2)]
+            else:
+                m = K.slice(states[3], [0, t], [-1, 2])
+            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
+            # (1, F, F)*(B, 1, 1) -> (B, F, F)
+            chain_energy = chain_energy * K.expand_dims(
+                K.expand_dims(m[:, 0] * m[:, 1]))
+        if return_logZ:
+            # shapes: (1, F, F) + (B, F, 1) -> (B, F, F)
+            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
+            new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
+            return new_target_val, [new_target_val, i + 1]
+        else:
+            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
+            min_energy = K.min(energy, 1)
+            # cast for tf-version `K.rnn`
+            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
+            return argmin_table, [min_energy, i + 1]
+
+    def recursion(self, input_energy, mask=None, go_backwards=False,
+                  return_sequences=True, return_logZ=True, input_length=None):
+        """Forward (alpha) or backward (beta) recursion
+        If `return_logZ = True`, compute logZ, the log of the normalization
+        constant:
+        \[ Z = \sum_{y1, y2, y3} exp(-E)  # energy
+          = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
+          = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
+          sum_{y1} exp(-(u1' y1 + y1' W y2))) \]
+        Denote:
+        \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
+        \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
+        \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)) \]
+        Note that:
+        yi's are one-hot vectors
+        u1, u3: boundary energies have been merged
+        If `return_logZ = False`, compute Viterbi's best-path lookup table.
+        """
+        chain_energy = self.chain_kernel
+        # shape=(1, F, F): F=num of output features.
1st F is for t-1, 2nd F for t + chain_energy = K.expand_dims(chain_energy, 0) + # shape=(B, F), dtype=float32 + prev_target_val = K.zeros_like(input_energy[:, 0, :]) + + if go_backwards: + input_energy = K.reverse(input_energy, 1) + if mask is not None: + mask = K.reverse(mask, 1) + + initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])] + constants = [chain_energy] + + if mask is not None: + mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), + K.floatx()) + constants.append(mask2) + + def _step(input_energy_i, states): + return self.step(input_energy_i, states, return_logZ) + + target_val_last, target_val_seq, _ = K.rnn(_step, input_energy, + initial_states, + constants=constants, + input_length=input_length, + unroll=self.unroll) + + if return_sequences: + if go_backwards: + target_val_seq = K.reverse(target_val_seq, 1) + return target_val_seq + else: + return target_val_last + + def forward_recursion(self, input_energy, **kwargs): + return self.recursion(input_energy, **kwargs) + + def backward_recursion(self, input_energy, **kwargs): + return self.recursion(input_energy, go_backwards=True, **kwargs) + + def get_marginal_prob(self, X, mask=None): + input_energy = self.activation(K.dot(X, self.kernel) + self.bias) + if self.use_boundary: + input_energy = self.add_boundary_energy(input_energy, mask, + self.left_boundary, + self.right_boundary) + input_length = K.int_shape(X)[1] + alpha = self.forward_recursion(input_energy, mask=mask, + input_length=input_length) + beta = self.backward_recursion(input_energy, mask=mask, + input_length=input_length) + if mask is not None: + input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx())) + margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta)) + return self.softmaxNd(margin) + + def viterbi_decoding(self, X, mask=None): + input_energy = self.activation(K.dot(X, self.kernel) + self.bias) + if self.use_boundary: + input_energy = self.add_boundary_energy( + input_energy, mask, self.left_boundary, self.right_boundary) + + argmin_tables = self.recursion(input_energy, mask, return_logZ=False) + argmin_tables = K.cast(argmin_tables, 'int32') + + # backward to find best path, `initial_best_idx` can be any, + # as all elements in the last argmin_table are the same + argmin_tables = K.reverse(argmin_tables, 1) + # matrix instead of vector is required by tf `K.rnn` + initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])] + if K.backend() == 'theano': + from theano import tensor as T + initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)] + + def gather_each_row(params, indices): + n = K.shape(indices)[0] + if K.backend() == 'theano': + from theano import tensor as T + return params[T.arange(n), indices] + elif K.backend() == 'tensorflow': + import tensorflow as tf + indices = K.transpose(K.stack([tf.range(n), indices])) + return tf.gather_nd(params, indices) + else: + raise NotImplementedError + + def find_path(argmin_table, best_idx): + next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0]) + next_best_idx = K.expand_dims(next_best_idx) + if K.backend() == 'theano': + from theano import tensor as T + next_best_idx = T.unbroadcast(next_best_idx, 1) + return next_best_idx, [next_best_idx] + + _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx, + input_length=K.int_shape(X)[1], unroll=self.unroll) + best_paths = K.reverse(best_paths, 1) + best_paths = K.squeeze(best_paths, 2) + + return K.one_hot(best_paths, self.units) \ No newline at end of file diff 
--git a/tensorflow_addons/losses/crf_losses.py b/tensorflow_addons/losses/crf_losses.py
new file mode 100644
index 0000000000..5729f7eb60
--- /dev/null
+++ b/tensorflow_addons/losses/crf_losses.py
@@ -0,0 +1,53 @@
+from keras import backend as K
+from keras.losses import categorical_crossentropy
+from keras.losses import sparse_categorical_crossentropy
+
+
+def crf_nll(y_true, y_pred):
+    """The negative log-likelihood for a linear chain Conditional Random Field (CRF).
+    This loss function is only used when the `layers.CRF` layer
+    is trained in the "join" mode.
+    # Arguments
+        y_true: tensor with true targets.
+        y_pred: tensor with predicted targets.
+    # Returns
+        A scalar corresponding to the negative log-likelihood.
+    # Raises
+        TypeError: If CRF is not the last layer.
+    # About GitHub
+        If you open an issue or a pull request about CRF, please
+        add `cc @lzfelix` to notify Luiz Felix.
+    """
+
+    crf, idx = y_pred._keras_history[:2]
+    if crf._outbound_nodes:
+        raise TypeError('When learn_mode="join", CRF must be the last layer.')
+    if crf.sparse_target:
+        y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units)
+    X = crf._inbound_nodes[idx].input_tensors[0]
+    mask = crf._inbound_nodes[idx].input_masks[0]
+    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
+    return nloglik
+
+
+def crf_loss(y_true, y_pred):
+    """General CRF loss function depending on the learning mode.
+    # Arguments
+        y_true: tensor with true targets.
+        y_pred: tensor with predicted targets.
+    # Returns
+        If the CRF layer is being trained in the join mode, returns the negative
+        log-likelihood. Otherwise returns the categorical crossentropy implemented
+        by the underlying Keras backend.
+    # About GitHub
+        If you open an issue or a pull request about CRF, please
+        add `cc @lzfelix` to notify Luiz Felix.
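+    # Example
+        A minimal usage sketch (hypothetical: `x` and `y` are placeholder
+        arrays, and the CRF layer is the keras_contrib-style layer whose own
+        docstring example this mirrors; in the join mode, crf_loss dispatches
+        to crf_nll):
+        ```python
+        from keras.models import Sequential
+        from keras.layers import Embedding
+        model = Sequential()
+        model.add(Embedding(3001, 300, mask_zero=True))
+        model.add(CRF(10, sparse_target=True))  # CRF is last => join mode works
+        model.compile('adam', loss=crf_loss)
+        model.fit(x, y)  # y: label indices (shape 1 at dim 3)
+        ```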
+ """ + crf, idx = y_pred._keras_history[:2] + if crf.learn_mode == 'join': + return crf_nll(y_true, y_pred) + else: + if crf.sparse_target: + return sparse_categorical_crossentropy(y_true, y_pred) + else: + return categorical_crossentropy(y_true, y_pred) diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_addons/metrics/marginal_acc.py b/tensorflow_addons/metrics/marginal_acc.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_addons/metrics/viterbi_acc.py b/tensorflow_addons/metrics/viterbi_acc.py new file mode 100644 index 0000000000..e69de29bb2 From 23460eb8594866c41b57ee35a3d5d36fbeee54f0 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 25 Jun 2019 18:45:14 +0800 Subject: [PATCH 10/52] save work progress --- tensorflow_addons/layers/crf.py | 47 ++ tensorflow_addons/layers/old_crf.py | 573 ++++++++++++++++++++++ tensorflow_addons/losses/crf_losses.py | 53 ++ tensorflow_addons/metrics/crf_accuracy.py | 0 tensorflow_addons/metrics/marginal_acc.py | 0 tensorflow_addons/metrics/viterbi_acc.py | 0 6 files changed, 673 insertions(+) create mode 100644 tensorflow_addons/layers/crf.py create mode 100644 tensorflow_addons/layers/old_crf.py create mode 100644 tensorflow_addons/losses/crf_losses.py create mode 100644 tensorflow_addons/metrics/crf_accuracy.py create mode 100644 tensorflow_addons/metrics/marginal_acc.py create mode 100644 tensorflow_addons/metrics/viterbi_acc.py diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py new file mode 100644 index 0000000000..7c0dd99c5b --- /dev/null +++ b/tensorflow_addons/layers/crf.py @@ -0,0 +1,47 @@ +import tensorflow as tf + +from tensorflow_addons.text.crf import crf_decode + + +class CRF(tf.keras.layers.Layer): + def __init__(self, units): + super(CRF, self).__init__() + self.units = units # numbers of tags + + def build(self, input_shape): + self.input_dim = input_shape[-1] + + self.kernel = self.add_weight(shape=(self.input_dim, self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + + self.chain_kernel = self.add_weight(shape=(self.units, self.units), + name='chain_kernel', + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint) + + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = 0 + + def call(self, input, **kwargs): + logits = self._dense_layer(input) + pred_ids, _ = crf_decode(logits, self.chain_kernel, nwords) + + def _dense_layer(self, input): + # TODO: can simply use tf.keras.layers.dense ? 
+        return self.activation(tf.matmul(input, self.kernel) + self.bias)
+
+
+if __name__ == "__main__":
+    layer = CRF(10)
+    print(layer(tf.zeros([10, 5])))
+    print(layer.trainable_variables)
diff --git a/tensorflow_addons/layers/old_crf.py b/tensorflow_addons/layers/old_crf.py
new file mode 100644
index 0000000000..180f97c132
--- /dev/null
+++ b/tensorflow_addons/layers/old_crf.py
@@ -0,0 +1,573 @@
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+from keras import backend as K
+from keras import activations
+from keras import initializers
+from keras import regularizers
+from keras import constraints
+from keras.layers import Layer
+from keras.layers import InputSpec
+
+from keras_contrib.losses import crf_loss
+from keras_contrib.metrics import crf_marginal_accuracy
+from keras_contrib.metrics import crf_viterbi_accuracy
+from keras_contrib.utils.test_utils import to_tuple
+
+
+class CRF(Layer):
+    """An implementation of a linear chain conditional random field (CRF).
+    A linear chain CRF is defined to maximize the following likelihood function:
+    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
+    \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n
+    - \sum_{k=1}^n((f(x_k' W + b) y_k) + y_1' U y_2)), $$
+    where:
+    $Z$: normalization constant
+    $x_k, y_k$: inputs and outputs
+    This implementation has two modes for optimization:
+    1. (`join mode`) optimized by maximizing the join likelihood,
+    which is optimal in the theory of statistics.
+    Note that in this case, CRF must be the output/last layer.
+    2. (`marginal mode`) returns marginal probabilities on each time
+    step and is optimized via the composition
+    likelihood (product of marginal likelihoods), i.e.,
+    using `categorical_crossentropy` loss.
+    Note that in this case, CRF can be either the last layer or an
+    intermediate layer (though not explored).
+    For prediction (test phase), one can choose either the Viterbi
+    best path (class indices) or marginal
+    probabilities if probabilities are needed.
+    However, if one chooses *join mode* for training,
+    Viterbi output is typically better than marginal output,
+    but the marginal output will still perform
+    reasonably closely, while if *marginal mode* is used for training,
+    the marginal output usually performs
+    much better. The default behavior and `metrics.crf_accuracy`
+    are set according to this observation.
+    In addition, this implementation supports masking and accepts either
+    one-hot or sparse targets.
+    If you open an issue or a pull request about CRF, please
+    add 'cc @lzfelix' to notify Luiz Felix.
+    # Examples
+    ```python
+    from keras.models import Sequential
+    from keras.layers import Embedding
+    from keras_contrib.layers import CRF
+    from keras_contrib.losses import crf_loss
+    from keras_contrib.metrics import crf_viterbi_accuracy
+    model = Sequential()
+    model.add(Embedding(3001, 300, mask_zero=True))
+    # use learn_mode = 'join', test_mode = 'viterbi',
+    # sparse_target = True (label index output)
+    crf = CRF(10, sparse_target=True)
+    model.add(crf)
+    # crf_accuracy defaults to Viterbi acc if using join-mode (the default).
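+    # (metrics.crf_accuracy simply picks crf_viterbi_accuracy or
+    # crf_marginal_accuracy to match the layer's test_mode)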
+    # One can add crf.marginal_acc if interested, but it may slow down learning
+    model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
+    # y must be label indices (with shape 1 at dim 3) here,
+    # since `sparse_target=True`
+    model.fit(x, y)
+    # prediction gives a one-hot representation of the Viterbi best path
+    y_hat = model.predict(x_test)
+    ```
+    The following snippet shows how to load a persisted
+    model that uses the CRF layer:
+    ```python
+    from keras.models import load_model
+    from keras_contrib.losses import crf_loss
+    from keras_contrib.metrics import crf_viterbi_accuracy
+    custom_objects={'CRF': CRF,
+                    'crf_loss': crf_loss,
+                    'crf_viterbi_accuracy': crf_viterbi_accuracy}
+    loaded_model = load_model('',
+                              custom_objects=custom_objects)
+    ```
+    # Arguments
+        units: Positive integer, dimensionality of the output space.
+        learn_mode: Either 'join' or 'marginal'.
+            The former trains the model by maximizing the join likelihood
+            while the latter maximizes the product of marginal likelihoods
+            over all time steps.
+            One should use `losses.crf_nll` for 'join' mode
+            and `losses.categorical_crossentropy` or
+            `losses.sparse_categorical_crossentropy` for
+            `marginal` mode. For convenience, simply
+            use `losses.crf_loss`, which will decide the proper loss as described.
+        test_mode: Either 'viterbi' or 'marginal'.
+            The former is recommended and is the default when `learn_mode = 'join'` and
+            gives a one-hot representation of the best path at test (prediction) time,
+            while the latter is recommended and chosen as the default
+            when `learn_mode = 'marginal'`,
+            which produces marginal probabilities for each time step.
+            For evaluating metrics, one should
+            use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and
+            `metrics.crf_marginal_accuracy` for 'marginal' mode, or
+            simply use `metrics.crf_accuracy` for
+            both, which automatically decides between them as described.
+            One can also use both for evaluation at training.
+        sparse_target: Boolean (default False) indicating
+            if provided labels are one-hot or
+            indices (with shape 1 at dim 3).
+        use_boundary: Boolean (default True) indicating if trainable
+            start-end chain energies
+            should be added to the model.
+        use_bias: Boolean, whether the layer uses a bias vector.
+        kernel_initializer: Initializer for the `kernel` weights matrix,
+            used for the linear transformation of the inputs.
+            (see [initializers](../initializers.md)).
+        chain_initializer: Initializer for the `chain_kernel` weights matrix,
+            used for the CRF chain energy.
+            (see [initializers](../initializers.md)).
+        boundary_initializer: Initializer for the `left_boundary`,
+            `right_boundary` weights vectors,
+            used for the start/left and end/right boundary energy.
+            (see [initializers](../initializers.md)).
+        bias_initializer: Initializer for the bias vector
+            (see [initializers](../initializers.md)).
+        activation: Activation function to use
+            (see [activations](../activations.md)).
+            If you pass None, no activation is applied
+            (ie. "linear" activation: `a(x) = x`).
+        kernel_regularizer: Regularizer function applied to
+            the `kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        chain_regularizer: Regularizer function applied to
+            the `chain_kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        boundary_regularizer: Regularizer function applied to
+            the `left_boundary`, `right_boundary` weight vectors
+            (see [regularizer](../regularizers.md)).
+        bias_regularizer: Regularizer function applied to the bias vector
+            (see [regularizer](../regularizers.md)).
+ kernel_constraint: Constraint function applied to + the `kernel` weights matrix + (see [constraints](../constraints.md)). + chain_constraint: Constraint function applied to + the `chain_kernel` weights matrix + (see [constraints](../constraints.md)). + boundary_constraint: Constraint function applied to + the `left_boundary`, `right_boundary` weights vectors + (see [constraints](../constraints.md)). + bias_constraint: Constraint function applied to the bias vector + (see [constraints](../constraints.md)). + input_dim: dimensionality of the input (integer). + This argument (or alternatively, the keyword argument `input_shape`) + is required when using this layer as the first layer in a model. + unroll: Boolean (default False). If True, the network will be + unrolled, else a symbolic loop will be used. + Unrolling can speed-up a RNN, although it tends + to be more memory-intensive. + Unrolling is only suitable for short sequences. + # Input shape + 3D tensor with shape `(nb_samples, timesteps, input_dim)`. + # Output shape + 3D tensor with shape `(nb_samples, timesteps, units)`. + # Masking + This layer supports masking for input data with a variable number + of timesteps. To introduce masks to your data, + use an [Embedding](embeddings.md) layer with the `mask_zero` parameter + set to `True`. + """ + + def __init__(self, units, + learn_mode='join', + test_mode=None, + sparse_target=False, + use_boundary=True, + use_bias=True, + activation='linear', + kernel_initializer='glorot_uniform', + chain_initializer='orthogonal', + bias_initializer='zeros', + boundary_initializer='zeros', + kernel_regularizer=None, + chain_regularizer=None, + boundary_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + chain_constraint=None, + boundary_constraint=None, + bias_constraint=None, + input_dim=None, + unroll=False, + **kwargs): + super(CRF, self).__init__(**kwargs) + self.supports_masking = True + self.units = units + self.learn_mode = learn_mode + assert self.learn_mode in ['join', 'marginal'] + self.test_mode = test_mode + if self.test_mode is None: + self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' + else: + assert self.test_mode in ['viterbi', 'marginal'] + self.sparse_target = sparse_target + self.use_boundary = use_boundary + self.use_bias = use_bias + + self.activation = activations.get(activation) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.chain_initializer = initializers.get(chain_initializer) + self.boundary_initializer = initializers.get(boundary_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.chain_regularizer = regularizers.get(chain_regularizer) + self.boundary_regularizer = regularizers.get(boundary_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.chain_constraint = constraints.get(chain_constraint) + self.boundary_constraint = constraints.get(boundary_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.unroll = unroll + + def build(self, input_shape): + input_shape = to_tuple(input_shape) + self.input_spec = [InputSpec(shape=input_shape)] + self.input_dim = input_shape[-1] + + self.kernel = self.add_weight(shape=(self.input_dim, self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.chain_kernel = 
self.add_weight(shape=(self.units, self.units), + name='chain_kernel', + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint) + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = 0 + + if self.use_boundary: + self.left_boundary = self.add_weight(shape=(self.units,), + name='left_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.right_boundary = self.add_weight(shape=(self.units,), + name='right_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.built = True + + def call(self, X, mask=None): + if mask is not None: + assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' + + if self.test_mode == 'viterbi': + test_output = self.viterbi_decoding(X, mask) + else: + test_output = self.get_marginal_prob(X, mask) + + self.uses_learning_phase = True + if self.learn_mode == 'join': + train_output = K.zeros_like(K.dot(X, self.kernel)) + out = K.in_train_phase(train_output, test_output) + else: + if self.test_mode == 'viterbi': + train_output = self.get_marginal_prob(X, mask) + out = K.in_train_phase(train_output, test_output) + else: + out = test_output + return out + + def compute_output_shape(self, input_shape): + return input_shape[:2] + (self.units,) + + def compute_mask(self, input, mask=None): + if mask is not None and self.learn_mode == 'join': + return K.any(mask, axis=1) + return mask + + def get_config(self): + config = { + 'units': self.units, + 'learn_mode': self.learn_mode, + 'test_mode': self.test_mode, + 'use_boundary': self.use_boundary, + 'use_bias': self.use_bias, + 'sparse_target': self.sparse_target, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), + 'chain_initializer': initializers.serialize(self.chain_initializer), + 'boundary_initializer': initializers.serialize( + self.boundary_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'activation': activations.serialize(self.activation), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), + 'chain_regularizer': regularizers.serialize(self.chain_regularizer), + 'boundary_regularizer': regularizers.serialize( + self.boundary_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'chain_constraint': constraints.serialize(self.chain_constraint), + 'boundary_constraint': constraints.serialize(self.boundary_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint), + 'input_dim': self.input_dim, + 'unroll': self.unroll} + base_config = super(CRF, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @property + def loss_function(self): + warnings.warn('CRF.loss_function is deprecated ' + 'and it might be removed in the future. Please ' + 'use losses.crf_loss instead.') + return crf_loss + + @property + def accuracy(self): + warnings.warn('CRF.accuracy is deprecated and it ' + 'might be removed in the future. 
Please '
+                      'use metrics.crf_accuracy instead.')
+        if self.test_mode == 'viterbi':
+            return crf_viterbi_accuracy
+        else:
+            return crf_marginal_accuracy
+
+    @property
+    def viterbi_acc(self):
+        warnings.warn('CRF.viterbi_acc is deprecated and it might '
+                      'be removed in the future. Please '
+                      'use metrics.viterbi_acc instead.')
+        return crf_viterbi_accuracy
+
+    @property
+    def marginal_acc(self):
+        warnings.warn('CRF.marginal_acc is deprecated and it '
+                      'might be removed in the future. Please '
+                      'use metrics.marginal_acc instead.')
+        return crf_marginal_accuracy
+
+    @staticmethod
+    def softmaxNd(x, axis=-1):
+        m = K.max(x, axis=axis, keepdims=True)
+        exp_x = K.exp(x - m)
+        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
+        return prob_x
+
+    @staticmethod
+    def shift_left(x, offset=1):
+        assert offset > 0
+        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)
+
+    @staticmethod
+    def shift_right(x, offset=1):
+        assert offset > 0
+        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)
+
+    def add_boundary_energy(self, energy, mask, start, end):
+        start = K.expand_dims(K.expand_dims(start, 0), 0)
+        end = K.expand_dims(K.expand_dims(end, 0), 0)
+        if mask is None:
+            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]],
+                                   axis=1)
+            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end],
+                                   axis=1)
+        else:
+            mask = K.expand_dims(K.cast(mask, K.floatx()))
+            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
+            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
+            energy = energy + start_mask * start
+            energy = energy + end_mask * end
+        return energy
+
+    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
+        """Compute logarithm of the normalization constant Z, where
+        Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ
+        """
+        # should have logZ[:, i] == logZ[:, j] for any i, j
+        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
+        return logZ[:, 0]
+
+    def get_energy(self, y_true, input_energy, mask):
+        """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3
+        """
+        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
+        # (B, T-1)
+        chain_energy = K.sum(K.dot(y_true[:, :-1, :],
+                                   self.chain_kernel) * y_true[:, 1:, :], 2)
+
+        if mask is not None:
+            mask = K.cast(mask, K.floatx())
+            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
+            chain_mask = mask[:, :-1] * mask[:, 1:]
+            input_energy = input_energy * mask
+            chain_energy = chain_energy * chain_mask
+        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )
+
+        return total_energy
+
+    def get_negative_log_likelihood(self, y_true, X, mask):
+        """Compute the loss, i.e., the negative log likelihood (normalized by the
+        number of time steps)
+        likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E
+        """
+        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
+        if self.use_boundary:
+            input_energy = self.add_boundary_energy(input_energy, mask,
+                                                    self.left_boundary,
+                                                    self.right_boundary)
+        energy = self.get_energy(y_true, input_energy, mask)
+        logZ = self.get_log_normalization_constant(input_energy, mask,
+                                                   input_length=K.int_shape(X)[1])
+        nloglik = logZ + energy
+        if mask is not None:
+            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
+        else:
+            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
+        return nloglik
+
+    def step(self, input_energy_t, states, return_logZ=True):
+        # Note: in the following, `prev_target_val` has shape = (B, F)
+        # where B = batch_size, F = output feature dim
+        # Note: `i` is of float32, due to the behavior of `K.rnn`
+        prev_target_val, i, chain_energy = states[:3]
+        t = K.cast(i[0, 0], dtype='int32')
+        if len(states) > 3:
+            if K.backend() == 'theano':
+                m = states[3][:, t:(t + 2)]
+            else:
+                m = K.slice(states[3], [0, t], [-1, 2])
+            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
+            # (1, F, F)*(B, 1, 1) -> (B, F, F)
+            chain_energy = chain_energy * K.expand_dims(
+                K.expand_dims(m[:, 0] * m[:, 1]))
+        if return_logZ:
+            # shapes: (1, F, F) + (B, F, 1) -> (B, F, F)
+            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
+            new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
+            return new_target_val, [new_target_val, i + 1]
+        else:
+            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
+            min_energy = K.min(energy, 1)
+            # cast for tf-version `K.rnn`
+            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
+            return argmin_table, [min_energy, i + 1]
+
+    def recursion(self, input_energy, mask=None, go_backwards=False,
+                  return_sequences=True, return_logZ=True, input_length=None):
+        """Forward (alpha) or backward (beta) recursion
+        If `return_logZ = True`, compute logZ, the log of the normalization
+        constant:
+        \[ Z = \sum_{y1, y2, y3} exp(-E)  # energy
+          = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
+          = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
+          sum_{y1} exp(-(u1' y1 + y1' W y2))) \]
+        Denote:
+        \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
+        \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
+        \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)) \]
+        Note that:
+        yi's are one-hot vectors
+        u1, u3: boundary energies have been merged
+        If `return_logZ = False`, compute Viterbi's best-path lookup table.
+        """
+        chain_energy = self.chain_kernel
+        # shape=(1, F, F): F=num of output features.
1st F is for t-1, 2nd F for t + chain_energy = K.expand_dims(chain_energy, 0) + # shape=(B, F), dtype=float32 + prev_target_val = K.zeros_like(input_energy[:, 0, :]) + + if go_backwards: + input_energy = K.reverse(input_energy, 1) + if mask is not None: + mask = K.reverse(mask, 1) + + initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])] + constants = [chain_energy] + + if mask is not None: + mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), + K.floatx()) + constants.append(mask2) + + def _step(input_energy_i, states): + return self.step(input_energy_i, states, return_logZ) + + target_val_last, target_val_seq, _ = K.rnn(_step, input_energy, + initial_states, + constants=constants, + input_length=input_length, + unroll=self.unroll) + + if return_sequences: + if go_backwards: + target_val_seq = K.reverse(target_val_seq, 1) + return target_val_seq + else: + return target_val_last + + def forward_recursion(self, input_energy, **kwargs): + return self.recursion(input_energy, **kwargs) + + def backward_recursion(self, input_energy, **kwargs): + return self.recursion(input_energy, go_backwards=True, **kwargs) + + def get_marginal_prob(self, X, mask=None): + input_energy = self.activation(K.dot(X, self.kernel) + self.bias) + if self.use_boundary: + input_energy = self.add_boundary_energy(input_energy, mask, + self.left_boundary, + self.right_boundary) + input_length = K.int_shape(X)[1] + alpha = self.forward_recursion(input_energy, mask=mask, + input_length=input_length) + beta = self.backward_recursion(input_energy, mask=mask, + input_length=input_length) + if mask is not None: + input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx())) + margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta)) + return self.softmaxNd(margin) + + def viterbi_decoding(self, X, mask=None): + input_energy = self.activation(K.dot(X, self.kernel) + self.bias) + if self.use_boundary: + input_energy = self.add_boundary_energy( + input_energy, mask, self.left_boundary, self.right_boundary) + + argmin_tables = self.recursion(input_energy, mask, return_logZ=False) + argmin_tables = K.cast(argmin_tables, 'int32') + + # backward to find best path, `initial_best_idx` can be any, + # as all elements in the last argmin_table are the same + argmin_tables = K.reverse(argmin_tables, 1) + # matrix instead of vector is required by tf `K.rnn` + initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])] + if K.backend() == 'theano': + from theano import tensor as T + initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)] + + def gather_each_row(params, indices): + n = K.shape(indices)[0] + if K.backend() == 'theano': + from theano import tensor as T + return params[T.arange(n), indices] + elif K.backend() == 'tensorflow': + import tensorflow as tf + indices = K.transpose(K.stack([tf.range(n), indices])) + return tf.gather_nd(params, indices) + else: + raise NotImplementedError + + def find_path(argmin_table, best_idx): + next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0]) + next_best_idx = K.expand_dims(next_best_idx) + if K.backend() == 'theano': + from theano import tensor as T + next_best_idx = T.unbroadcast(next_best_idx, 1) + return next_best_idx, [next_best_idx] + + _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx, + input_length=K.int_shape(X)[1], unroll=self.unroll) + best_paths = K.reverse(best_paths, 1) + best_paths = K.squeeze(best_paths, 2) + + return K.one_hot(best_paths, self.units) \ No newline at end of file diff 
--git a/tensorflow_addons/losses/crf_losses.py b/tensorflow_addons/losses/crf_losses.py
new file mode 100644
index 0000000000..5729f7eb60
--- /dev/null
+++ b/tensorflow_addons/losses/crf_losses.py
@@ -0,0 +1,53 @@
+from keras import backend as K
+from keras.losses import categorical_crossentropy
+from keras.losses import sparse_categorical_crossentropy
+
+
+def crf_nll(y_true, y_pred):
+    """The negative log-likelihood for a linear chain Conditional Random Field (CRF).
+    This loss function is only used when the `layers.CRF` layer
+    is trained in the "join" mode.
+    # Arguments
+        y_true: tensor with true targets.
+        y_pred: tensor with predicted targets.
+    # Returns
+        A scalar corresponding to the negative log-likelihood.
+    # Raises
+        TypeError: If CRF is not the last layer.
+    # About GitHub
+        If you open an issue or a pull request about CRF, please
+        add `cc @lzfelix` to notify Luiz Felix.
+    """
+
+    crf, idx = y_pred._keras_history[:2]
+    if crf._outbound_nodes:
+        raise TypeError('When learn_mode="join", CRF must be the last layer.')
+    if crf.sparse_target:
+        y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units)
+    X = crf._inbound_nodes[idx].input_tensors[0]
+    mask = crf._inbound_nodes[idx].input_masks[0]
+    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
+    return nloglik
+
+
+def crf_loss(y_true, y_pred):
+    """General CRF loss function depending on the learning mode.
+    # Arguments
+        y_true: tensor with true targets.
+        y_pred: tensor with predicted targets.
+    # Returns
+        If the CRF layer is being trained in the join mode, returns the negative
+        log-likelihood. Otherwise returns the categorical crossentropy implemented
+        by the underlying Keras backend.
+    # About GitHub
+        If you open an issue or a pull request about CRF, please
+        add `cc @lzfelix` to notify Luiz Felix.
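+    # Example
+        A sketch of the marginal branch (hypothetical model; with
+        `learn_mode='marginal'` and one-hot targets, crf_loss dispatches to
+        categorical_crossentropy):
+        ```python
+        from keras.models import Sequential
+        from keras.layers import Embedding
+        model = Sequential()
+        model.add(Embedding(3001, 300, mask_zero=True))
+        model.add(CRF(10, learn_mode='marginal'))
+        model.compile('adam', loss=crf_loss)
+        ```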
+ """ + crf, idx = y_pred._keras_history[:2] + if crf.learn_mode == 'join': + return crf_nll(y_true, y_pred) + else: + if crf.sparse_target: + return sparse_categorical_crossentropy(y_true, y_pred) + else: + return categorical_crossentropy(y_true, y_pred) diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_addons/metrics/marginal_acc.py b/tensorflow_addons/metrics/marginal_acc.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_addons/metrics/viterbi_acc.py b/tensorflow_addons/metrics/viterbi_acc.py new file mode 100644 index 0000000000..e69de29bb2 From a0b6b7d330d7a9f166139bb6ed3b687436196e76 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 22 Jul 2019 18:45:45 +0800 Subject: [PATCH 11/52] remove useless file --- tensorflow_addons/layers/old_crf.py | 573 ---------------------------- 1 file changed, 573 deletions(-) delete mode 100644 tensorflow_addons/layers/old_crf.py diff --git a/tensorflow_addons/layers/old_crf.py b/tensorflow_addons/layers/old_crf.py deleted file mode 100644 index 180f97c132..0000000000 --- a/tensorflow_addons/layers/old_crf.py +++ /dev/null @@ -1,573 +0,0 @@ -from __future__ import absolute_import -from __future__ import division - -import warnings - -from keras import backend as K -from keras import activations -from keras import initializers -from keras import regularizers -from keras import constraints -from keras.layers import Layer -from keras.layers import InputSpec - -from keras_contrib.losses import crf_loss -from keras_contrib.metrics import crf_marginal_accuracy -from keras_contrib.metrics import crf_viterbi_accuracy -from keras_contrib.utils.test_utils import to_tuple - - -class CRF(Layer): - """An implementation of linear chain conditional random field (CRF). - An linear chain CRF is defined to maximize the following likelihood function: - $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z} - \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n - - \sum_{k=1^n}((f(x_k' W + b) y_k) + y_1' U y_2)), $$ - where: - $Z$: normalization constant - $x_k, y_k$: inputs and outputs - This implementation has two modes for optimization: - 1. (`join mode`) optimized by maximizing join likelihood, - which is optimal in theory of statistics. - Note that in this case, CRF must be the output/last layer. - 2. (`marginal mode`) return marginal probabilities on each time - step and optimized via composition - likelihood (product of marginal likelihood), i.e., - using `categorical_crossentropy` loss. - Note that in this case, CRF can be either the last layer or an - intermediate layer (though not explored). - For prediction (test phrase), one can choose either Viterbi - best path (class indices) or marginal - probabilities if probabilities are needed. - However, if one chooses *join mode* for training, - Viterbi output is typically better than marginal output, - but the marginal output will still perform - reasonably close, while if *marginal mode* is used for training, - marginal output usually performs - much better. The default behavior and `metrics.crf_accuracy` - is set according to this observation. - In addition, this implementation supports masking and accepts either - onehot or sparse target. - If you open a issue or a pull request about CRF, please - add 'cc @lzfelix' to notify Luiz Felix. 
- # Examples - ```python - from keras_contrib.layers import CRF - from keras_contrib.losses import crf_loss - from keras_contrib.metrics import crf_viterbi_accuracy - model = Sequential() - model.add(Embedding(3001, 300, mask_zero=True)(X) - # use learn_mode = 'join', test_mode = 'viterbi', - # sparse_target = True (label indice output) - crf = CRF(10, sparse_target=True) - model.add(crf) - # crf_accuracy is default to Viterbi acc if using join-mode (default). - # One can add crf.marginal_acc if interested, but may slow down learning - model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy]) - # y must be label indices (with shape 1 at dim 3) here, - # since `sparse_target=True` - model.fit(x, y) - # prediction give onehot representation of Viterbi best path - y_hat = model.predict(x_test) - ``` - The following snippet shows how to load a persisted - model that uses the CRF layer: - ```python - from keras.models import load_model - from keras_contrib.losses import import crf_loss - from keras_contrib.metrics import crf_viterbi_accuracy - custom_objects={'CRF': CRF, - 'crf_loss': crf_loss, - 'crf_viterbi_accuracy': crf_viterbi_accuracy} - loaded_model = load_model('', - custom_objects=custom_objects) - ``` - # Arguments - units: Positive integer, dimensionality of the output space. - learn_mode: Either 'join' or 'marginal'. - The former train the model by maximizing join likelihood while the latter - maximize the product of marginal likelihood over all time steps. - One should use `losses.crf_nll` for 'join' mode - and `losses.categorical_crossentropy` or - `losses.sparse_categorical_crossentropy` for - `marginal` mode. For convenience, simply - use `losses.crf_loss`, which will decide the proper loss as described. - test_mode: Either 'viterbi' or 'marginal'. - The former is recommended and as default when `learn_mode = 'join'` and - gives one-hot representation of the best path at test (prediction) time, - while the latter is recommended and chosen as default - when `learn_mode = 'marginal'`, - which produces marginal probabilities for each time step. - For evaluating metrics, one should - use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and - 'metrics.crf_marginal_accuracy' for 'marginal' mode, or - simply use `metrics.crf_accuracy` for - both which automatically decides it as described. - One can also use both for evaluation at training. - sparse_target: Boolean (default False) indicating - if provided labels are one-hot or - indices (with shape 1 at dim 3). - use_boundary: Boolean (default True) indicating if trainable - start-end chain energies - should be added to model. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). - chain_initializer: Initializer for the `chain_kernel` weights matrix, - used for the CRF chain energy. - (see [initializers](../initializers.md)). - boundary_initializer: Initializer for the `left_boundary`, - 'right_boundary' weights vectors, - used for the start/left and end/right boundary energy. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). - activation: Activation function to use - (see [activations](../activations.md)). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). 
- kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). - chain_regularizer: Regularizer function applied to - the `chain_kernel` weights matrix - (see [regularizer](../regularizers.md)). - boundary_regularizer: Regularizer function applied to - the 'left_boundary', 'right_boundary' weight vectors - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). - chain_constraint: Constraint function applied to - the `chain_kernel` weights matrix - (see [constraints](../constraints.md)). - boundary_constraint: Constraint function applied to - the `left_boundary`, `right_boundary` weights vectors - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). - input_dim: dimensionality of the input (integer). - This argument (or alternatively, the keyword argument `input_shape`) - is required when using this layer as the first layer in a model. - unroll: Boolean (default False). If True, the network will be - unrolled, else a symbolic loop will be used. - Unrolling can speed-up a RNN, although it tends - to be more memory-intensive. - Unrolling is only suitable for short sequences. - # Input shape - 3D tensor with shape `(nb_samples, timesteps, input_dim)`. - # Output shape - 3D tensor with shape `(nb_samples, timesteps, units)`. - # Masking - This layer supports masking for input data with a variable number - of timesteps. To introduce masks to your data, - use an [Embedding](embeddings.md) layer with the `mask_zero` parameter - set to `True`. 
- """ - - def __init__(self, units, - learn_mode='join', - test_mode=None, - sparse_target=False, - use_boundary=True, - use_bias=True, - activation='linear', - kernel_initializer='glorot_uniform', - chain_initializer='orthogonal', - bias_initializer='zeros', - boundary_initializer='zeros', - kernel_regularizer=None, - chain_regularizer=None, - boundary_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - chain_constraint=None, - boundary_constraint=None, - bias_constraint=None, - input_dim=None, - unroll=False, - **kwargs): - super(CRF, self).__init__(**kwargs) - self.supports_masking = True - self.units = units - self.learn_mode = learn_mode - assert self.learn_mode in ['join', 'marginal'] - self.test_mode = test_mode - if self.test_mode is None: - self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' - else: - assert self.test_mode in ['viterbi', 'marginal'] - self.sparse_target = sparse_target - self.use_boundary = use_boundary - self.use_bias = use_bias - - self.activation = activations.get(activation) - - self.kernel_initializer = initializers.get(kernel_initializer) - self.chain_initializer = initializers.get(chain_initializer) - self.boundary_initializer = initializers.get(boundary_initializer) - self.bias_initializer = initializers.get(bias_initializer) - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.chain_regularizer = regularizers.get(chain_regularizer) - self.boundary_regularizer = regularizers.get(boundary_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.chain_constraint = constraints.get(chain_constraint) - self.boundary_constraint = constraints.get(boundary_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.unroll = unroll - - def build(self, input_shape): - input_shape = to_tuple(input_shape) - self.input_spec = [InputSpec(shape=input_shape)] - self.input_dim = input_shape[-1] - - self.kernel = self.add_weight(shape=(self.input_dim, self.units), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - self.chain_kernel = self.add_weight(shape=(self.units, self.units), - name='chain_kernel', - initializer=self.chain_initializer, - regularizer=self.chain_regularizer, - constraint=self.chain_constraint) - if self.use_bias: - self.bias = self.add_weight(shape=(self.units,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = 0 - - if self.use_boundary: - self.left_boundary = self.add_weight(shape=(self.units,), - name='left_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) - self.right_boundary = self.add_weight(shape=(self.units,), - name='right_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) - self.built = True - - def call(self, X, mask=None): - if mask is not None: - assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' - - if self.test_mode == 'viterbi': - test_output = self.viterbi_decoding(X, mask) - else: - test_output = self.get_marginal_prob(X, mask) - - self.uses_learning_phase = True - if self.learn_mode == 'join': - train_output = K.zeros_like(K.dot(X, self.kernel)) - out = K.in_train_phase(train_output, test_output) - else: - if 
self.test_mode == 'viterbi': - train_output = self.get_marginal_prob(X, mask) - out = K.in_train_phase(train_output, test_output) - else: - out = test_output - return out - - def compute_output_shape(self, input_shape): - return input_shape[:2] + (self.units,) - - def compute_mask(self, input, mask=None): - if mask is not None and self.learn_mode == 'join': - return K.any(mask, axis=1) - return mask - - def get_config(self): - config = { - 'units': self.units, - 'learn_mode': self.learn_mode, - 'test_mode': self.test_mode, - 'use_boundary': self.use_boundary, - 'use_bias': self.use_bias, - 'sparse_target': self.sparse_target, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'chain_initializer': initializers.serialize(self.chain_initializer), - 'boundary_initializer': initializers.serialize( - self.boundary_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'activation': activations.serialize(self.activation), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'chain_regularizer': regularizers.serialize(self.chain_regularizer), - 'boundary_regularizer': regularizers.serialize( - self.boundary_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'chain_constraint': constraints.serialize(self.chain_constraint), - 'boundary_constraint': constraints.serialize(self.boundary_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint), - 'input_dim': self.input_dim, - 'unroll': self.unroll} - base_config = super(CRF, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @property - def loss_function(self): - warnings.warn('CRF.loss_function is deprecated ' - 'and it might be removed in the future. Please ' - 'use losses.crf_loss instead.') - return crf_loss - - @property - def accuracy(self): - warnings.warn('CRF.accuracy is deprecated and it ' - 'might be removed in the future. Please ' - 'use metrics.crf_accuracy') - if self.test_mode == 'viterbi': - return crf_viterbi_accuracy - else: - return crf_marginal_accuracy - - @property - def viterbi_acc(self): - warnings.warn('CRF.viterbi_acc is deprecated and it might ' - 'be removed in the future. Please ' - 'use metrics.viterbi_acc instead.') - return crf_viterbi_accuracy - - @property - def marginal_acc(self): - warnings.warn('CRF.moarginal_acc is deprecated and it ' - 'might be removed in the future. 
Please ' - 'use metrics.marginal_acc instead.') - return crf_marginal_accuracy - - @staticmethod - def softmaxNd(x, axis=-1): - m = K.max(x, axis=axis, keepdims=True) - exp_x = K.exp(x - m) - prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True) - return prob_x - - @staticmethod - def shift_left(x, offset=1): - assert offset > 0 - return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1) - - @staticmethod - def shift_right(x, offset=1): - assert offset > 0 - return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1) - - def add_boundary_energy(self, energy, mask, start, end): - start = K.expand_dims(K.expand_dims(start, 0), 0) - end = K.expand_dims(K.expand_dims(end, 0), 0) - if mask is None: - energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]], - axis=1) - energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end], - axis=1) - else: - mask = K.expand_dims(K.cast(mask, K.floatx())) - start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx()) - end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx()) - energy = energy + start_mask * start - energy = energy + end_mask * end - return energy - - def get_log_normalization_constant(self, input_energy, mask, **kwargs): - """Compute logarithm of the normalization constant Z, where - Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ - """ - # should have logZ[:, i] == logZ[:, j] for any i, j - logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs) - return logZ[:, 0] - - def get_energy(self, y_true, input_energy, mask): - """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3 - """ - input_energy = K.sum(input_energy * y_true, 2) # (B, T) - # (B, T-1) - chain_energy = K.sum(K.dot(y_true[:, :-1, :], - self.chain_kernel) * y_true[:, 1:, :], 2) - - if mask is not None: - mask = K.cast(mask, K.floatx()) - # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding - chain_mask = mask[:, :-1] * mask[:, 1:] - input_energy = input_energy * mask - chain_energy = chain_energy * chain_mask - total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1) # (B, ) - - return total_energy - - def get_negative_log_likelihood(self, y_true, X, mask): - """Compute the loss, i.e., negative log likelihood (normalize by number of time steps) - likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E - """ - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy(input_energy, mask, - self.left_boundary, - self.right_boundary) - energy = self.get_energy(y_true, input_energy, mask) - logZ = self.get_log_normalization_constant(input_energy, mask, - input_length=K.int_shape(X)[1]) - nloglik = logZ + energy - if mask is not None: - nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1) - else: - nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx()) - return nloglik - - def step(self, input_energy_t, states, return_logZ=True): - # not in the following `prev_target_val` has shape = (B, F) - # where B = batch_size, F = output feature dim - # Note: `i` is of float32, due to the behavior of `K.rnn` - prev_target_val, i, chain_energy = states[:3] - t = K.cast(i[0, 0], dtype='int32') - if len(states) > 3: - if K.backend() == 'theano': - m = states[3][:, t:(t + 2)] - else: - m = K.slice(states[3], [0, t], [-1, 2]) - input_energy_t = input_energy_t * K.expand_dims(m[:, 0]) - # (1, F, F)*(B, 1, 1) -> (B, F, F) - chain_energy = chain_energy * 
K.expand_dims( - K.expand_dims(m[:, 0] * m[:, 1])) - if return_logZ: - # shapes: (1, B, F) + (B, F, 1) -> (B, F, F) - energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2) - new_target_val = K.logsumexp(-energy, 1) # shapes: (B, F) - return new_target_val, [new_target_val, i + 1] - else: - energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2) - min_energy = K.min(energy, 1) - # cast for tf-version `K.rnn - argmin_table = K.cast(K.argmin(energy, 1), K.floatx()) - return argmin_table, [min_energy, i + 1] - - def recursion(self, input_energy, mask=None, go_backwards=False, - return_sequences=True, return_logZ=True, input_length=None): - """Forward (alpha) or backward (beta) recursion - If `return_logZ = True`, compute the logZ, the normalization constant: - \[ Z = \sum_{y1, y2, y3} exp(-E) # energy - = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3)) - = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3)) - sum_{y1} exp(-(u1' y1' + y1' W y2))) \] - Denote: - \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \] - \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \] - \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1' + y1' W y2)) \] - Note that: - yi's are one-hot vectors - u1, u3: boundary energies have been merged - If `return_logZ = False`, compute the Viterbi's best path lookup table. - """ - chain_energy = self.chain_kernel - # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t - chain_energy = K.expand_dims(chain_energy, 0) - # shape=(B, F), dtype=float32 - prev_target_val = K.zeros_like(input_energy[:, 0, :]) - - if go_backwards: - input_energy = K.reverse(input_energy, 1) - if mask is not None: - mask = K.reverse(mask, 1) - - initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])] - constants = [chain_energy] - - if mask is not None: - mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), - K.floatx()) - constants.append(mask2) - - def _step(input_energy_i, states): - return self.step(input_energy_i, states, return_logZ) - - target_val_last, target_val_seq, _ = K.rnn(_step, input_energy, - initial_states, - constants=constants, - input_length=input_length, - unroll=self.unroll) - - if return_sequences: - if go_backwards: - target_val_seq = K.reverse(target_val_seq, 1) - return target_val_seq - else: - return target_val_last - - def forward_recursion(self, input_energy, **kwargs): - return self.recursion(input_energy, **kwargs) - - def backward_recursion(self, input_energy, **kwargs): - return self.recursion(input_energy, go_backwards=True, **kwargs) - - def get_marginal_prob(self, X, mask=None): - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy(input_energy, mask, - self.left_boundary, - self.right_boundary) - input_length = K.int_shape(X)[1] - alpha = self.forward_recursion(input_energy, mask=mask, - input_length=input_length) - beta = self.backward_recursion(input_energy, mask=mask, - input_length=input_length) - if mask is not None: - input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx())) - margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta)) - return self.softmaxNd(margin) - - def viterbi_decoding(self, X, mask=None): - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy( - input_energy, mask, self.left_boundary, self.right_boundary) - - argmin_tables = 
self.recursion(input_energy, mask, return_logZ=False) - argmin_tables = K.cast(argmin_tables, 'int32') - - # backward to find best path, `initial_best_idx` can be any, - # as all elements in the last argmin_table are the same - argmin_tables = K.reverse(argmin_tables, 1) - # matrix instead of vector is required by tf `K.rnn` - initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])] - if K.backend() == 'theano': - from theano import tensor as T - initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)] - - def gather_each_row(params, indices): - n = K.shape(indices)[0] - if K.backend() == 'theano': - from theano import tensor as T - return params[T.arange(n), indices] - elif K.backend() == 'tensorflow': - import tensorflow as tf - indices = K.transpose(K.stack([tf.range(n), indices])) - return tf.gather_nd(params, indices) - else: - raise NotImplementedError - - def find_path(argmin_table, best_idx): - next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0]) - next_best_idx = K.expand_dims(next_best_idx) - if K.backend() == 'theano': - from theano import tensor as T - next_best_idx = T.unbroadcast(next_best_idx, 1) - return next_best_idx, [next_best_idx] - - _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx, - input_length=K.int_shape(X)[1], unroll=self.unroll) - best_paths = K.reverse(best_paths, 1) - best_paths = K.squeeze(best_paths, 2) - - return K.one_hot(best_paths, self.units) \ No newline at end of file From d8d98a8eabbc991eacfd2bbb6fa7760427b58f99 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 30 Jul 2019 19:12:43 +0800 Subject: [PATCH 12/52] Update & bugfix --- tensorflow_addons/layers/BUILD | 2 + tensorflow_addons/layers/README.md | 2 + tensorflow_addons/layers/crf.py | 313 +++++++++++++++++++++- tensorflow_addons/losses/BUILD | 1 + tensorflow_addons/losses/README.md | 3 + tensorflow_addons/losses/crf_losses.py | 49 +--- tensorflow_addons/metrics/BUILD | 1 + tensorflow_addons/metrics/crf_accuracy.py | 51 ++++ 8 files changed, 372 insertions(+), 50 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index e05719a245..5a143b9168 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -11,11 +11,13 @@ py_library( "poincare.py", "sparsemax.py", "wrappers.py", + "crf.py" ], srcs_version = "PY2AND3", deps = [ "//tensorflow_addons/activations", "//tensorflow_addons/utils", + "//tensorflow_addons/text", ], ) diff --git a/tensorflow_addons/layers/README.md b/tensorflow_addons/layers/README.md index 94cf0b55d8..9f8bc6cc45 100644 --- a/tensorflow_addons/layers/README.md +++ b/tensorflow_addons/layers/README.md @@ -8,6 +8,7 @@ | poincare | | | | sparsemax | @AndreasMadsen | amwwebdk+github@gmail.com | | wrappers | @seanpmorgan | seanmorgan@outlook.com | +| crf | @howl-anderson | u1mail2me@gmail.com | ## Components | Submodule | Layer | Reference | @@ -18,6 +19,7 @@ | poincare | PoincareNormalize | https://arxiv.org/abs/1705.08039 | | sparsemax| Sparsemax | https://arxiv.org/abs/1602.02068 | | wrappers | WeightNormalization | https://arxiv.org/abs/1602.07868 | +| crf | CRF | https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers | ## Contribution Guidelines #### Standard API diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 7c0dd99c5b..67a72b4019 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -1,28 +1,122 @@ import tensorflow as tf +from tensorflow.python.keras import backend as K +from 
tensorflow.python.keras import initializers, regularizers, constraints, \ + activations +from tensorflow.python.keras.layers import InputSpec, Layer +from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood +from tensorflow_addons.utils import keras_utils -from tensorflow_addons.text.crf import crf_decode +""" +TODO +* learn_mode is not supported +* test_mode is not supported +* sparse_target is not supported +* use_boundary need test +* input_dim is not know how to use +* unroll is not supported + +* left padding of mask is not supported + +* not test yet if CRF is the first layer +""" + + +@keras_utils.register_keras_custom_object +class CRF(Layer): + def __init__(self, units, + learn_mode='join', + test_mode=None, + sparse_target=False, + use_boundary=False, + use_bias=True, + activation='linear', + kernel_initializer='glorot_uniform', + chain_initializer='orthogonal', + bias_initializer='zeros', + boundary_initializer='zeros', + kernel_regularizer=None, + chain_regularizer=None, + boundary_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + chain_constraint=None, + boundary_constraint=None, + bias_constraint=None, + input_dim=None, + unroll=False, + **kwargs): + super(CRF, self).__init__(**kwargs) + + # setup mask supporting flag, used by base class (the Layer) + self.supports_masking = True -class CRF(tf.keras.layers.Layer): - def __init__(self, units): - super(CRF, self).__init__() self.units = units # numbers of tags + self.learn_mode = learn_mode + assert self.learn_mode in ['join', 'marginal'] + + self.test_mode = test_mode + if self.test_mode is None: + self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' + else: + assert self.test_mode in ['viterbi', 'marginal'] + self.sparse_target = sparse_target + self.use_boundary = use_boundary + self.use_bias = use_bias + + self.activation = activations.get(activation) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.chain_initializer = initializers.get(chain_initializer) + self.boundary_initializer = initializers.get(boundary_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.chain_regularizer = regularizers.get(chain_regularizer) + self.boundary_regularizer = regularizers.get(boundary_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.chain_constraint = constraints.get(chain_constraint) + self.boundary_constraint = constraints.get(boundary_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.input_dim = input_dim + self.unroll = unroll + + # value remembered for loss/metrics function + self.logits = None + self.nwords = None + self.mask = None + + # global variable + self.kernel = None + self.chain_kernel = None + self.bias = None + self.left_boundary = None + self.right_boundary = None + def build(self, input_shape): + input_shape = tuple(tf.TensorShape(input_shape).as_list()) + self.input_spec = [InputSpec(shape=input_shape)] self.input_dim = input_shape[-1] + # weights that mapping arbitrary tensor to correct shape self.kernel = self.add_weight(shape=(self.input_dim, self.units), name='kernel', initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, constraint=self.kernel_constraint) + # weights that work as transfer probability of each tags self.chain_kernel = self.add_weight(shape=(self.units, self.units), name='chain_kernel', 
initializer=self.chain_initializer, regularizer=self.chain_regularizer, constraint=self.chain_constraint) + # bias that works with self.kernel if self.use_bias: self.bias = self.add_weight(shape=(self.units,), name='bias', @@ -32,16 +126,209 @@ def build(self, input_shape): else: self.bias = 0 - def call(self, input, **kwargs): - logits = self._dense_layer(input) + # weight of to tag probability and tag to probability + if self.use_boundary: + self.left_boundary = self.add_weight(shape=(self.units,), + name='left_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.right_boundary = self.add_weight(shape=(self.units,), + name='right_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + + # or directly call self.built = True + super(CRF, self).build(input_shape) + + def call(self, inputs, mask=None, **kwargs): + # mask: Tensor(shape=(?, ?), dtype=bool) or None + + if mask is not None: + assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' + + # remember this value for later use + self.mask = mask + + logits = self._dense_layer(inputs) + + # appending boundary probability info + if self.use_boundary: + logits = self.add_boundary_energy( + logits, mask, self.left_boundary, self.right_boundary) + + # remember this value for later use + self.logits = logits + + nwords = self._get_nwords(inputs, mask) + + # remember this value for later use + self.nwords = nwords + + if self.test_mode == 'viterbi': + test_output = self.get_viterbi_decoding(logits, nwords) + else: + # TODO: not supported yet + pass + # test_output = self.get_marginal_prob(input, mask) + + if self.learn_mode == 'join': + # WHY: don't remove this line, useless but remote it will cause bug + test_output = tf.cast(test_output, tf.float32) + out = test_output + else: + # TODO: not supported yet + pass + # if self.test_mode == 'viterbi': + # train_output = self.get_marginal_prob(input, mask) + # out = K.in_train_phase(train_output, + # test_output) + # else: + # out = test_output + + return out + + def _get_nwords(self, input, mask): + if mask is not None: + int_mask = K.cast(mask, tf.int8) + nwords = self.mask_to_nwords(int_mask) + else: + # make a mask tensor from input, then used to generate nwords + input_energy_shape = tf.shape(input) + raw_input_shape = tf.slice(input_energy_shape, [0], [2]) + alt_mask = tf.ones(raw_input_shape) + + nwords = self.mask_to_nwords(alt_mask) + + return nwords + + def mask_to_nwords(self, mask): + nwords = K.cast(K.sum(mask, 1), tf.int64) + return nwords + + @staticmethod + def shift_left(x, offset=1): + assert offset > 0 + return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1) + + @staticmethod + def shift_right(x, offset=1): + assert offset > 0 + return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1) + + def add_boundary_energy(self, energy, mask, start, end): + def expend_scalar_to_3d(x): + # expend tensor from shape (x, ) to (1, 1, x) + return K.expand_dims(K.expand_dims(x, 0), 0) + + start = expend_scalar_to_3d(start) + end = expend_scalar_to_3d(end) + if mask is None: + energy = K.concatenate( + [energy[:, :1, :] + start, energy[:, 1:, :]], + axis=1) + energy = K.concatenate( + [energy[:, :-1, :], energy[:, -1:, :] + end], + axis=1) + else: + mask = K.expand_dims(K.cast(mask, K.floatx()), axis=-1) + start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx()) + 
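(Editor's note: illustrative, not part of the patch. `start_mask` above and the patched `end_mask` below detect the rising and falling edges of the padding mask, i.e. the first and last real token of each sequence. A minimal numpy sketch of the same edge detection, assuming a hypothetical 2-D float mask:)

```python
import numpy as np

mask = np.array([[0., 1., 1., 1., 0.]])  # one left- and right-padded sequence

# shift_right / shift_left as defined by the layer's static helpers above
shifted_right = np.concatenate([np.zeros_like(mask[:, :1]), mask[:, :-1]], axis=1)
shifted_left = np.concatenate([mask[:, 1:], np.zeros_like(mask[:, :1])], axis=1)

start_mask = (mask > shifted_right).astype(float)  # [[0., 1., 0., 0., 0.]] first real token
end_mask = (mask > shifted_left).astype(float)     # [[0., 0., 0., 1., 0.]] last real token
```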
+ # original code: + # end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx()) + # Note: original code should have a bug, + # need confirmed with @lzfelix (Luiz Felix) + # patch applied + end_mask = K.cast(K.greater(mask, self.shift_left(mask)), K.floatx()) + energy = energy + start_mask * start + energy = energy + end_mask * end + return energy + + def get_viterbi_decoding(self, input_energy, nwords): + pred_ids, _ = crf_decode(input_energy, self.chain_kernel, nwords) + + return pred_ids + + def get_config(self): + # used for loading model from disk + config = { + 'units': self.units, + 'learn_mode': self.learn_mode, + 'test_mode': self.test_mode, + 'use_boundary': self.use_boundary, + 'use_bias': self.use_bias, + 'sparse_target': self.sparse_target, + 'kernel_initializer': initializers.serialize( + self.kernel_initializer), + 'chain_initializer': initializers.serialize( + self.chain_initializer), + 'boundary_initializer': initializers.serialize( + self.boundary_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'activation': activations.serialize(self.activation), + 'kernel_regularizer': regularizers.serialize( + self.kernel_regularizer), + 'chain_regularizer': regularizers.serialize( + self.chain_regularizer), + 'boundary_regularizer': regularizers.serialize( + self.boundary_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'chain_constraint': constraints.serialize(self.chain_constraint), + 'boundary_constraint': constraints.serialize( + self.boundary_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint), + 'input_dim': self.input_dim, + 'unroll': self.unroll} + base_config = super(CRF, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def compute_output_shape(self, input_shape): + output_shape = input_shape[:2] + return output_shape + + def compute_mask(self, input, mask=None): + if mask is not None and self.learn_mode == 'join': + # transform mask from shape (?, ?) to (?, ) + new_mask = K.any(mask, axis=1) + return new_mask + + return mask + + def get_decode_result(self, logits, mask): + nwords = K.cast(K.sum(mask, 1), tf.int64) + pred_ids, _ = crf_decode(logits, self.chain_kernel, nwords) - def _dense_layer(self, input): - # TODO: can simply use tf.keras.layers.dense ? 
- return self.activation(tf.matmul(input, self.kernel) + self.bias) + return pred_ids + def get_negative_log_likelihood(self, y_true): + y_preds = self.logits + + nwords = self.nwords + + y_preds = K.cast(y_preds, tf.float32) + y_true = K.cast(y_true, tf.int32) + nwords = K.cast(nwords, tf.int32) + self.chain_kernel = K.cast(self.chain_kernel, tf.float32) + + log_likelihood, _ = crf_log_likelihood( + y_preds, + y_true, + nwords, + self.chain_kernel + ) + + return -log_likelihood + + def get_accuracy(self, y_true, y_pred): + judge = K.cast(K.equal(y_pred, y_true), K.floatx()) + if self.mask is None: + return K.mean(judge) + else: + mask = K.cast(self.mask, K.floatx()) + return K.sum(judge * mask) / K.sum(mask) -if __name__ == "__main__": - layer = CRF(10) - print(layer(tf.zeros([10, 5]))) - print(layer.trainable_variables) + def _dense_layer(self, input_): + return self.activation(K.dot(input_, self.kernel) + self.bias) diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index 927f821fe2..d43936ab83 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -13,6 +13,7 @@ py_library( "npairs.py", "sparsemax_loss.py", "triplet.py", + "crf_losses.py" ], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow_addons/losses/README.md b/tensorflow_addons/losses/README.md index e0951d41c1..81b7b27b66 100644 --- a/tensorflow_addons/losses/README.md +++ b/tensorflow_addons/losses/README.md @@ -9,6 +9,8 @@ | npairs | @WindQAQ | windqaq@gmail.com | | sparsemax_loss | @AndreasMadsen | amwwebdk+github@gmail.com | | triplet | | | +| crf | @howl-anderson | u1mail2me@gmail.com | + ## Components | Submodule | Loss | Reference | @@ -19,6 +21,7 @@ | npairs | NpairsLoss | http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf | | sparsemax_loss | SparsemaxLoss | https://arxiv.org/abs/1602.02068 | | triplet | TripletSemiHardLoss | https://arxiv.org/abs/1503.03832 | +| crf | CRF | https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers | ## Contribution Guidelines diff --git a/tensorflow_addons/losses/crf_losses.py b/tensorflow_addons/losses/crf_losses.py index 5729f7eb60..25e9b74e61 100644 --- a/tensorflow_addons/losses/crf_losses.py +++ b/tensorflow_addons/losses/crf_losses.py @@ -1,49 +1,24 @@ -from keras import backend as K -from keras.losses import categorical_crossentropy -from keras.losses import sparse_categorical_crossentropy +from tensorflow.python.keras.losses import sparse_categorical_crossentropy, \ + categorical_crossentropy +from tensorflow_addons.utils import keras_utils -def crf_nll(y_true, y_pred): - """The negative log-likelihood for linear chain Conditional Random Field (CRF). - This loss function is only used when the `layers.CRF` layer - is trained in the "join" mode. - # Arguments - y_true: tensor with true targets. - y_pred: tensor with predicted targets. - # Returns - A scalar representing corresponding to the negative log-likelihood. - # Raises - TypeError: If CRF is not the last layer. - # About GitHub - If you open an issue or a pull request about CRF, please - add `cc @lzfelix` to notify Luiz Felix. 
- """ +def crf_nll(y_true, y_pred): crf, idx = y_pred._keras_history[:2] - if crf._outbound_nodes: - raise TypeError('When learn_model="join", CRF must be the last layer.') - if crf.sparse_target: - y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units) - X = crf._inbound_nodes[idx].input_tensors[0] - mask = crf._inbound_nodes[idx].input_masks[0] - nloglik = crf.get_negative_log_likelihood(y_true, X, mask) + + node = crf._inbound_nodes[idx] + + nloglik = crf.get_negative_log_likelihood(y_true) + return nloglik +@keras_utils.register_keras_custom_object def crf_loss(y_true, y_pred): - """General CRF loss function depending on the learning mode. - # Arguments - y_true: tensor with true targets. - y_pred: tensor with predicted targets. - # Returns - If the CRF layer is being trained in the join mode, returns the negative - log-likelihood. Otherwise returns the categorical crossentropy implemented - by the underlying Keras backend. - # About GitHub - If you open an issue or a pull request about CRF, please - add `cc @lzfelix` to notify Luiz Felix. - """ + # TODO: change to tf 2.0 class based implementation crf, idx = y_pred._keras_history[:2] + if crf.learn_mode == 'join': return crf_nll(y_true, y_pred) else: diff --git a/tensorflow_addons/metrics/BUILD b/tensorflow_addons/metrics/BUILD index 33db5e905f..64d13f0e6d 100644 --- a/tensorflow_addons/metrics/BUILD +++ b/tensorflow_addons/metrics/BUILD @@ -11,6 +11,7 @@ py_library( "multilabel_confusion_matrix.py", "r_square.py", "utils.py", + "crf_accuracy.py" ], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py index e69de29bb2..237937dd84 100644 --- a/tensorflow_addons/metrics/crf_accuracy.py +++ b/tensorflow_addons/metrics/crf_accuracy.py @@ -0,0 +1,51 @@ +import tensorflow as tf + +from tensorflow_addons.utils import keras_utils + + +def _get_accuracy(y_true, y_pred, mask, sparse_target=False): + y_pred = tf.keras.backend.argmax(y_pred, -1) + if sparse_target: + y_true = tf.keras.backend.cast(y_true[:, :, 0], tf.keras.backend.dtype(y_pred)) + else: + y_true = tf.keras.backend.argmax(y_true, -1) + judge = tf.keras.backend.cast(tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) + if mask is None: + return tf.keras.backend.mean(judge) + else: + mask = tf.keras.backend.cast(mask, tf.keras.backend.floatx()) + return tf.keras.backend.sum(judge * mask) / tf.keras.backend.sum(mask) + + +def crf_viterbi_accuracy(y_true, y_pred): + """ + Use Viterbi algorithm to get best path, and compute its accuracy. + `y_pred` must be an output from CRF. + """ + crf, idx = y_pred._keras_history[:2] + return crf.get_accuracy(y_true, y_pred) + + +def crf_marginal_accuracy(y_true, y_pred): + """ + Use time-wise marginal argmax as prediction. + `y_pred` must be an output from CRF with `learn_mode="marginal"`. + """ + crf, idx = y_pred._keras_history[:2] + X = crf._inbound_nodes[idx].input_tensors[0] + mask = crf._inbound_nodes[idx].input_masks[0] + y_pred = crf.get_marginal_prob(X, mask) + return _get_accuracy(y_true, y_pred, mask, crf.sparse_target) + + +@keras_utils.register_keras_custom_object +def crf_accuracy(y_true, y_pred): + # TODO: using tf 2.0 class based implementation + """ + Ge default accuracy based on CRF `test_mode`. 
+ """ + crf, idx = y_pred._keras_history[:2] + if crf.test_mode == 'viterbi': + return crf_viterbi_accuracy(y_true, y_pred) + else: + return crf_marginal_accuracy(y_true, y_pred) \ No newline at end of file From 0b58e5f0b828764d742464ee6e13c8b795c99e4e Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Thu, 3 Oct 2019 20:39:27 +0800 Subject: [PATCH 13/52] Update code and test cases; not ready for release yet --- tensorflow_addons/layers/crf.py | 348 ++++++++++++---------- tensorflow_addons/layers/crf_test.py | 76 +++++ tensorflow_addons/losses/crf_loss.py | 43 +++ tensorflow_addons/losses/crf_loss_test.py | 106 +++++++ tensorflow_addons/losses/crf_losses.py | 28 -- 5 files changed, 413 insertions(+), 188 deletions(-) create mode 100644 tensorflow_addons/layers/crf_test.py create mode 100644 tensorflow_addons/losses/crf_loss.py create mode 100644 tensorflow_addons/losses/crf_loss_test.py delete mode 100644 tensorflow_addons/losses/crf_losses.py diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 67a72b4019..96e63858ff 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -1,40 +1,49 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Original implementation from keras_contrib/layers/crf
+# ==============================================================================
+"""Implementing Conditional Random Field layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import tensorflow as tf
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import initializers, regularizers, constraints, \
-    activations
-from tensorflow.python.keras.layers import InputSpec, Layer
 from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood
 from tensorflow_addons.utils import keras_utils
 
 """
 TODO
-* learn_mode is not supported
-* test_mode is not supported
-* sparse_target is not supported
-* use_boundary need test
-* input_dim is not know how to use
-* unroll is not supported
-
-* left padding of mask is not supported
-
+* decide whether input_dim should be kept or dropped
+* left padding of mask is not supported (a future version should fix it)
 * not test yet if CRF is the first layer
+* Add docs
 """
-
 @keras_utils.register_keras_custom_object
-class CRF(Layer):
-    def __init__(self, units,
-                 learn_mode='join',
-                 test_mode=None,
-                 sparse_target=False,
+class CRF(tf.keras.layers.Layer):
+    def __init__(self,
+                 units,
                  use_boundary=False,
                  use_bias=True,
-                 activation='linear',
-                 kernel_initializer='glorot_uniform',
-                 chain_initializer='orthogonal',
-                 bias_initializer='zeros',
-                 boundary_initializer='zeros',
+                 activation="linear",
+                 kernel_initializer="glorot_uniform",
+                 chain_initializer="orthogonal",
+                 bias_initializer="zeros",
+                 boundary_initializer="zeros",
                  kernel_regularizer=None,
                  chain_regularizer=None,
                  boundary_regularizer=None,
@@ -44,46 +53,43 @@ def __init__(self, units,
                  boundary_constraint=None,
                  bias_constraint=None,
                  input_dim=None,
-                 unroll=False,
                  **kwargs):
         super(CRF, self).__init__(**kwargs)
 
         # setup mask supporting flag, used by base class (the Layer)
+        # because the base class's init method will set it to False unconditionally,
+        # so this assignment must be executed after calling the base class's init method
         self.supports_masking = True
 
         self.units = units  # numbers of tags
 
-        self.learn_mode = learn_mode
-        assert self.learn_mode in ['join', 'marginal']
-
-        self.test_mode = test_mode
-        if self.test_mode is None:
-            self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal'
-        else:
-            assert self.test_mode in ['viterbi', 'marginal']
-        self.sparse_target = sparse_target
         self.use_boundary = use_boundary
         self.use_bias = use_bias
 
-        self.activation = activations.get(activation)
+        self.activation = tf.keras.activations.get(activation)
 
-        self.kernel_initializer = initializers.get(kernel_initializer)
-        self.chain_initializer = initializers.get(chain_initializer)
-        self.boundary_initializer = initializers.get(boundary_initializer)
-        self.bias_initializer = initializers.get(bias_initializer)
+        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
+        self.chain_initializer = tf.keras.initializers.get(chain_initializer)
+        self.boundary_initializer = tf.keras.initializers.get(
+            boundary_initializer)
+        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
 
-        self.kernel_regularizer = regularizers.get(kernel_regularizer)
-        self.chain_regularizer = regularizers.get(chain_regularizer)
-        self.boundary_regularizer = regularizers.get(boundary_regularizer)
-        self.bias_regularizer = regularizers.get(bias_regularizer)
+        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
+        self.chain_regularizer = 
tf.keras.regularizers.get(chain_regularizer) + self.boundary_regularizer = tf.keras.regularizers.get( + boundary_regularizer) + self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) - self.kernel_constraint = constraints.get(kernel_constraint) - self.chain_constraint = constraints.get(chain_constraint) - self.boundary_constraint = constraints.get(boundary_constraint) - self.bias_constraint = constraints.get(bias_constraint) + self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self.chain_constraint = tf.keras.constraints.get(chain_constraint) + self.boundary_constraint = tf.keras.constraints.get( + boundary_constraint) + self.bias_constraint = tf.keras.constraints.get(bias_constraint) self.input_dim = input_dim - self.unroll = unroll + + # values will be assigned in method + self.input_spec = None # value remembered for loss/metrics function self.logits = None @@ -99,45 +105,58 @@ def __init__(self, units, def build(self, input_shape): input_shape = tuple(tf.TensorShape(input_shape).as_list()) - self.input_spec = [InputSpec(shape=input_shape)] + + # see API docs of InputSpec for more detail + self.input_spec = [tf.keras.layers.InputSpec(shape=input_shape)] + self.input_dim = input_shape[-1] # weights that mapping arbitrary tensor to correct shape - self.kernel = self.add_weight(shape=(self.input_dim, self.units), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) + self.kernel = self.add_weight( + shape=(self.input_dim, self.units), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) # weights that work as transfer probability of each tags - self.chain_kernel = self.add_weight(shape=(self.units, self.units), - name='chain_kernel', - initializer=self.chain_initializer, - regularizer=self.chain_regularizer, - constraint=self.chain_constraint) + self.chain_kernel = self.add_weight( + shape=(self.units, self.units), + name="chain_kernel", + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint, + ) # bias that works with self.kernel if self.use_bias: - self.bias = self.add_weight(shape=(self.units,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) + self.bias = self.add_weight( + shape=(self.units, ), + name="bias", + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + ) else: self.bias = 0 # weight of to tag probability and tag to probability if self.use_boundary: - self.left_boundary = self.add_weight(shape=(self.units,), - name='left_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) - self.right_boundary = self.add_weight(shape=(self.units,), - name='right_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) + self.left_boundary = self.add_weight( + shape=(self.units, ), + name="left_boundary", + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint, + ) + self.right_boundary = self.add_weight( + shape=(self.units, ), + name="right_boundary", + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint, + ) # or directly call 
self.built = True super(CRF, self).build(input_shape) @@ -146,7 +165,8 @@ def call(self, inputs, mask=None, **kwargs): # mask: Tensor(shape=(?, ?), dtype=bool) or None if mask is not None: - assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' + assert (tf.keras.backend.ndim(mask) == 2 + ), "Input mask to CRF must have dim 2 if not None" # remember this value for later use self.mask = mask @@ -155,8 +175,8 @@ def call(self, inputs, mask=None, **kwargs): # appending boundary probability info if self.use_boundary: - logits = self.add_boundary_energy( - logits, mask, self.left_boundary, self.right_boundary) + logits = self.add_boundary_energy(logits, mask, self.left_boundary, + self.right_boundary) # remember this value for later use self.logits = logits @@ -166,32 +186,17 @@ def call(self, inputs, mask=None, **kwargs): # remember this value for later use self.nwords = nwords - if self.test_mode == 'viterbi': - test_output = self.get_viterbi_decoding(logits, nwords) - else: - # TODO: not supported yet - pass - # test_output = self.get_marginal_prob(input, mask) - - if self.learn_mode == 'join': - # WHY: don't remove this line, useless but remote it will cause bug - test_output = tf.cast(test_output, tf.float32) - out = test_output - else: - # TODO: not supported yet - pass - # if self.test_mode == 'viterbi': - # train_output = self.get_marginal_prob(input, mask) - # out = K.in_train_phase(train_output, - # test_output) - # else: - # out = test_output + test_output = self.get_viterbi_decoding(logits, nwords) + + # WHY: don't remove this line, useless but remote it will cause bug + test_output = tf.cast(test_output, tf.float32) + out = test_output return out def _get_nwords(self, input, mask): if mask is not None: - int_mask = K.cast(mask, tf.int8) + int_mask = tf.keras.backend.cast(mask, tf.int8) nwords = self.mask_to_nwords(int_mask) else: # make a mask tensor from input, then used to generate nwords @@ -204,43 +209,59 @@ def _get_nwords(self, input, mask): return nwords def mask_to_nwords(self, mask): - nwords = K.cast(K.sum(mask, 1), tf.int64) + nwords = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) return nwords @staticmethod def shift_left(x, offset=1): assert offset > 0 - return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1) + return tf.keras.backend.concatenate( + [x[:, offset:], + tf.keras.backend.zeros_like(x[:, :offset])], + axis=1) @staticmethod def shift_right(x, offset=1): assert offset > 0 - return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1) + return tf.keras.backend.concatenate( + [tf.keras.backend.zeros_like(x[:, :offset]), x[:, :-offset]], + axis=1) def add_boundary_energy(self, energy, mask, start, end): def expend_scalar_to_3d(x): # expend tensor from shape (x, ) to (1, 1, x) - return K.expand_dims(K.expand_dims(x, 0), 0) + return tf.keras.backend.expand_dims( + tf.keras.backend.expand_dims(x, 0), 0) start = expend_scalar_to_3d(start) end = expend_scalar_to_3d(end) if mask is None: - energy = K.concatenate( - [energy[:, :1, :] + start, energy[:, 1:, :]], - axis=1) - energy = K.concatenate( - [energy[:, :-1, :], energy[:, -1:, :] + end], - axis=1) + energy = tf.keras.backend.concatenate( + [energy[:, :1, :] + start, energy[:, 1:, :]], axis=1) + energy = tf.keras.backend.concatenate( + [energy[:, :-1, :], energy[:, -1:, :] + end], axis=1) else: - mask = K.expand_dims(K.cast(mask, K.floatx()), axis=-1) - start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx()) + mask = 
tf.keras.backend.expand_dims(tf.keras.backend.cast( + mask, tf.keras.backend.floatx()), + axis=-1) + start_mask = tf.keras.backend.cast( + tf.keras.backend.greater(mask, self.shift_right(mask)), + tf.keras.backend.floatx(), + ) # original code: - # end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx()) + # end_mask = K.cast( + # K.greater(self.shift_left(mask), mask), + # K.floatx() + # ) # Note: original code should have a bug, - # need confirmed with @lzfelix (Luiz Felix) - # patch applied - end_mask = K.cast(K.greater(mask, self.shift_left(mask)), K.floatx()) + # need confirmed with @lzfelix (Luiz Felix), + # mailed to him but no reply for months, + # patch applied here. + end_mask = tf.keras.backend.cast( + tf.keras.backend.greater(mask, self.shift_left(mask)), + tf.keras.backend.floatx(), + ) energy = energy + start_mask * start energy = energy + end_mask * end return energy @@ -253,34 +274,41 @@ def get_viterbi_decoding(self, input_energy, nwords): def get_config(self): # used for loading model from disk config = { - 'units': self.units, - 'learn_mode': self.learn_mode, - 'test_mode': self.test_mode, - 'use_boundary': self.use_boundary, - 'use_bias': self.use_bias, - 'sparse_target': self.sparse_target, - 'kernel_initializer': initializers.serialize( - self.kernel_initializer), - 'chain_initializer': initializers.serialize( - self.chain_initializer), - 'boundary_initializer': initializers.serialize( - self.boundary_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'activation': activations.serialize(self.activation), - 'kernel_regularizer': regularizers.serialize( - self.kernel_regularizer), - 'chain_regularizer': regularizers.serialize( - self.chain_regularizer), - 'boundary_regularizer': regularizers.serialize( - self.boundary_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'chain_constraint': constraints.serialize(self.chain_constraint), - 'boundary_constraint': constraints.serialize( - self.boundary_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint), - 'input_dim': self.input_dim, - 'unroll': self.unroll} + "units": + self.units, + "use_boundary": + self.use_boundary, + "use_bias": + self.use_bias, + "kernel_initializer": + tf.keras.initializers.serialize(self.kernel_initializer), + "chain_initializer": + tf.keras.initializers.serialize(self.chain_initializer), + "boundary_initializer": + tf.keras.initializers.serialize(self.boundary_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self.bias_initializer), + "activation": + tf.keras.activations.serialize(self.activation), + "kernel_regularizer": + tf.keras.regularizers.serialize(self.kernel_regularizer), + "chain_regularizer": + tf.keras.regularizers.serialize(self.chain_regularizer), + "boundary_regularizer": + tf.keras.regularizers.serialize(self.boundary_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self.bias_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self.kernel_constraint), + "chain_constraint": + tf.keras.constraints.serialize(self.chain_constraint), + "boundary_constraint": + tf.keras.constraints.serialize(self.boundary_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self.bias_constraint), + "input_dim": + self.input_dim, + } base_config = super(CRF, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -289,15 +317,15 @@ 
def compute_output_shape(self, input_shape): return output_shape def compute_mask(self, input, mask=None): - if mask is not None and self.learn_mode == 'join': + if mask is not None: # transform mask from shape (?, ?) to (?, ) - new_mask = K.any(mask, axis=1) + new_mask = tf.keras.backend.any(mask, axis=1) return new_mask return mask def get_decode_result(self, logits, mask): - nwords = K.cast(K.sum(mask, 1), tf.int64) + nwords = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) pred_ids, _ = crf_decode(logits, self.chain_kernel, nwords) @@ -308,27 +336,27 @@ def get_negative_log_likelihood(self, y_true): nwords = self.nwords - y_preds = K.cast(y_preds, tf.float32) - y_true = K.cast(y_true, tf.int32) - nwords = K.cast(nwords, tf.int32) - self.chain_kernel = K.cast(self.chain_kernel, tf.float32) + y_preds = tf.keras.backend.cast(y_preds, tf.float32) + y_true = tf.keras.backend.cast(y_true, tf.int32) + nwords = tf.keras.backend.cast(nwords, tf.int32) + self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, + tf.float32) - log_likelihood, _ = crf_log_likelihood( - y_preds, - y_true, - nwords, - self.chain_kernel - ) + log_likelihood, _ = crf_log_likelihood(y_preds, y_true, nwords, + self.chain_kernel) return -log_likelihood def get_accuracy(self, y_true, y_pred): - judge = K.cast(K.equal(y_pred, y_true), K.floatx()) + judge = tf.keras.backend.cast(tf.keras.backend.equal(y_pred, y_true), + tf.keras.backend.floatx()) if self.mask is None: - return K.mean(judge) + return tf.keras.backend.mean(judge) else: - mask = K.cast(self.mask, K.floatx()) - return K.sum(judge * mask) / K.sum(mask) + mask = tf.keras.backend.cast(self.mask, tf.keras.backend.floatx()) + return (tf.keras.backend.sum(judge * mask) / + tf.keras.backend.sum(mask)) def _dense_layer(self, input_): - return self.activation(K.dot(input_, self.kernel) + self.bias) + return self.activation( + tf.keras.backend.dot(input_, self.kernel) + self.bias) diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py new file mode 100644 index 0000000000..de37e74679 --- /dev/null +++ b/tensorflow_addons/layers/crf_test.py @@ -0,0 +1,76 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Conditional Random Field layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +from tensorflow_addons.layers.crf import CRF +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class TestCRF(tf.test.TestCase): + def test_unmasked_viterbi_decode(self): + x = np.array([ + [ + # O B-X I-X B-Y I-Y + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 1., 0., 0.] + ], + [ + # O B-X I-X B-Y I-Y + [0., 1., 0., 0., 0.], + [0., 1., 0., 0., 0.], + [0., 1., 0., 0., 0.] 
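(Editor's note: illustrative, not part of the test. With the all-ones transition matrix and boundary vector used further below, every path receives the same transition and boundary score, so Viterbi decoding reduces to a per-step argmax over these one-hot emission rows; that is exactly what `expected_y` encodes. A quick numpy check of that claim:)

```python
import numpy as np

x = np.array([[[0., 1., 0., 0., 0.],
               [0., 0., 1., 0., 0.],
               [0., 0., 1., 0., 0.]],
              [[0., 1., 0., 0., 0.],
               [0., 1., 0., 0., 0.],
               [0., 1., 0., 0., 0.]]])

print(x.argmax(axis=-1))  # [[1 2 2]
                          #  [1 1 1]]  matches expected_y below
```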
+ ] + ]) # yapf: disable + + expected_y = np.array([ + [1, 2, 2], # B-X I-X I-X + [1, 1, 1] # B-X B-X B-X + ]) # yapf: disable + + transitions = np.ones([5, 5]) + transitions_from_start = np.ones(5) + transitions_to_end = np.ones(5) + + test_utils.layer_test( + CRF, + kwargs={ + "units": + 5, + "use_kernel": + False, # disable kernel transform + "chain_initializer": + tf.keras.initializers.Constant(transitions), + "use_boundary": + True, + "left_boundary_initializer": + tf.keras.initializers.Constant(transitions_from_start), + "right_boundary_initializer": + tf.keras.initializers.Constant(transitions_to_end), + }, + input_data=x, + expected_output=expected_y, + ) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py new file mode 100644 index 0000000000..1dcdf95ce0 --- /dev/null +++ b/tensorflow_addons/losses/crf_loss.py @@ -0,0 +1,43 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementing Conditional Random Field loss.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow_addons.utils import keras_utils + + +@keras_utils.register_keras_custom_object +@tf.function +def crf_loss(y_true, y_pred): + crf, idx = y_pred._keras_history[:2] + + nloglik = crf.get_negative_log_likelihood(y_true) + + return nloglik + + +@keras_utils.register_keras_custom_object +class ConditionalRandomFieldLoss(object): + def get_config(self): + return {} + + def __call__(self, y_true, y_pred, sample_weight=None): + loss_vector = crf_loss(y_true, y_pred) + + return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py new file mode 100644 index 0000000000..a80d055a4e --- /dev/null +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -0,0 +1,106 @@ +## Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Conditional Random Field loss.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import math + +import numpy as np +import tensorflow as tf + +from tensorflow_addons.layers.crf import CRF +from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class ConditionalRandomFieldLossTest(tf.test.TestCase): + def test_forward_works_without_mask(self): + self.logits = np.array([ + [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], + [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], + ]) + self.tags = np.array([[2, 3, 4], [3, 2, 2]]) + + self.transitions = np.array([ + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.8, 0.3, 0.1, 0.7, 0.9], + [-0.3, 2.1, -5.6, 3.4, 4.0], + [0.2, 0.4, 0.6, -0.3, -0.4], + [1.0, 1.0, 1.0, 1.0, 1.0], + ]) + + self.transitions_from_start = np.array([0.1, 0.2, 0.3, 0.4, 0.6]) + self.transitions_to_end = np.array([-0.1, -0.2, 0.3, -0.4, -0.4]) + + # Use the CRF Module with fixed transitions to compute the log_likelihood + self.crf = CRF( + units=5, + use_kernel=False, # disable kernel transform + chain_initializer=tf.keras.initializers.Constant(self.transitions), + use_boundary=True, + left_boundary_initializer=tf.keras.initializers.Constant( + self.transitions_from_start), + right_boundary_initializer=tf.keras.initializers.Constant( + self.transitions_to_end), + name="crf_layer", + ) + + crf_loss_instance = ConditionalRandomFieldLoss() + + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile("adam", loss={"crf_layer": crf_loss_instance}) + model.summary() + + log_likelihood = model.train_on_batch(self.logits, self.tags) + + def compute_log_likelihood(): + # Now compute the log-likelihood manually + manual_log_likelihood = 0.0 + + # For each instance, manually compute the numerator + # (which is just the score for the logits and actual tags) + # and the denominator + # (which is the log-sum-exp of the scores + # for the logits across all possible tags) + for logits_i, tags_i in zip(self.logits, self.tags): + numerator = self.score(logits_i, tags_i) + all_scores = [ + self.score(logits_i, tags_j) + for tags_j in itertools.product(range(5), repeat=3) + ] + denominator = math.log( + sum(math.exp(score) for score in all_scores)) + # And include them in the manual calculation. + manual_log_likelihood += numerator - denominator + + return manual_log_likelihood + + # The manually computed log likelihood should + # equal the result of crf.forward. 
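(Editor's note: not part of the test. `train_on_batch` returns the loss value, which `ConditionalRandomFieldLoss` defines as the mean negative log likelihood over the batch of two sequences, while `compute_log_likelihood` sums the log likelihoods of both instances; the summed value is therefore recovered as `-2 * log_likelihood` in the assertion below.)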
+ expected_log_likelihood = compute_log_likelihood() + unbatched_log_likelihood = -2 * log_likelihood + + self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow_addons/losses/crf_losses.py b/tensorflow_addons/losses/crf_losses.py deleted file mode 100644 index 25e9b74e61..0000000000 --- a/tensorflow_addons/losses/crf_losses.py +++ /dev/null @@ -1,28 +0,0 @@ -from tensorflow.python.keras.losses import sparse_categorical_crossentropy, \ - categorical_crossentropy - -from tensorflow_addons.utils import keras_utils - - -def crf_nll(y_true, y_pred): - crf, idx = y_pred._keras_history[:2] - - node = crf._inbound_nodes[idx] - - nloglik = crf.get_negative_log_likelihood(y_true) - - return nloglik - - -@keras_utils.register_keras_custom_object -def crf_loss(y_true, y_pred): - # TODO: change to tf 2.0 class based implementation - crf, idx = y_pred._keras_history[:2] - - if crf.learn_mode == 'join': - return crf_nll(y_true, y_pred) - else: - if crf.sparse_target: - return sparse_categorical_crossentropy(y_true, y_pred) - else: - return categorical_crossentropy(y_true, y_pred) From 17bfe5f226f73c452d943547bd9aacbeaa724bf3 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 4 Oct 2019 11:45:06 +0800 Subject: [PATCH 14/52] Update code --- tensorflow_addons/layers/BUILD | 4 +- tensorflow_addons/layers/crf.py | 18 ++++----- tensorflow_addons/losses/BUILD | 2 +- tensorflow_addons/metrics/BUILD | 2 +- tensorflow_addons/metrics/crf_accuracy.py | 46 +---------------------- tensorflow_addons/metrics/marginal_acc.py | 0 tensorflow_addons/metrics/viterbi_acc.py | 0 7 files changed, 14 insertions(+), 58 deletions(-) delete mode 100644 tensorflow_addons/metrics/marginal_acc.py delete mode 100644 tensorflow_addons/metrics/viterbi_acc.py diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 03d87419bd..c494b8793a 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -6,6 +6,7 @@ py_library( name = "layers", srcs = [ "__init__.py", + "crf.py", "gelu.py", "maxout.py", "normalizations.py", @@ -13,7 +14,6 @@ py_library( "poincare.py", "sparsemax.py", "wrappers.py", - "crf.py" ], data = [ "//tensorflow_addons/custom_ops/layers:_correlation_cost_ops.so", @@ -21,8 +21,8 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow_addons/activations", - "//tensorflow_addons/utils", "//tensorflow_addons/text", + "//tensorflow_addons/utils", ], ) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 96e63858ff..a7e8bedbd9 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -23,7 +23,6 @@ import tensorflow as tf from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood from tensorflow_addons.utils import keras_utils - """ TODO @@ -33,6 +32,7 @@ * Add docs """ + @keras_utils.register_keras_custom_object class CRF(tf.keras.layers.Layer): def __init__(self, @@ -132,7 +132,7 @@ def build(self, input_shape): # bias that works with self.kernel if self.use_bias: self.bias = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="bias", initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -144,14 +144,14 @@ def build(self, input_shape): # weight of to tag probability and tag to probability if self.use_boundary: self.left_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="left_boundary", initializer=self.boundary_initializer, 
regularizer=self.boundary_regularizer, constraint=self.boundary_constraint, ) self.right_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="right_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, @@ -241,9 +241,9 @@ def expend_scalar_to_3d(x): energy = tf.keras.backend.concatenate( [energy[:, :-1, :], energy[:, -1:, :] + end], axis=1) else: - mask = tf.keras.backend.expand_dims(tf.keras.backend.cast( - mask, tf.keras.backend.floatx()), - axis=-1) + mask = tf.keras.backend.expand_dims( + tf.keras.backend.cast(mask, tf.keras.backend.floatx()), + axis=-1) start_mask = tf.keras.backend.cast( tf.keras.backend.greater(mask, self.shift_right(mask)), tf.keras.backend.floatx(), @@ -348,8 +348,8 @@ def get_negative_log_likelihood(self, y_true): return -log_likelihood def get_accuracy(self, y_true, y_pred): - judge = tf.keras.backend.cast(tf.keras.backend.equal(y_pred, y_true), - tf.keras.backend.floatx()) + judge = tf.keras.backend.cast( + tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) if self.mask is None: return tf.keras.backend.mean(judge) else: diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index d43936ab83..705340a902 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -7,13 +7,13 @@ py_library( srcs = [ "__init__.py", "contrastive.py", + "crf_loss.py", "focal_loss.py", "lifted.py", "metric_learning.py", "npairs.py", "sparsemax_loss.py", "triplet.py", - "crf_losses.py" ], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow_addons/metrics/BUILD b/tensorflow_addons/metrics/BUILD index 64d13f0e6d..f1d32a6212 100644 --- a/tensorflow_addons/metrics/BUILD +++ b/tensorflow_addons/metrics/BUILD @@ -7,11 +7,11 @@ py_library( srcs = [ "__init__.py", "cohens_kappa.py", + "crf_accuracy.py", "f_scores.py", "multilabel_confusion_matrix.py", "r_square.py", "utils.py", - "crf_accuracy.py" ], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py index 237937dd84..e8d0c3b8b3 100644 --- a/tensorflow_addons/metrics/crf_accuracy.py +++ b/tensorflow_addons/metrics/crf_accuracy.py @@ -1,51 +1,7 @@ -import tensorflow as tf - from tensorflow_addons.utils import keras_utils -def _get_accuracy(y_true, y_pred, mask, sparse_target=False): - y_pred = tf.keras.backend.argmax(y_pred, -1) - if sparse_target: - y_true = tf.keras.backend.cast(y_true[:, :, 0], tf.keras.backend.dtype(y_pred)) - else: - y_true = tf.keras.backend.argmax(y_true, -1) - judge = tf.keras.backend.cast(tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) - if mask is None: - return tf.keras.backend.mean(judge) - else: - mask = tf.keras.backend.cast(mask, tf.keras.backend.floatx()) - return tf.keras.backend.sum(judge * mask) / tf.keras.backend.sum(mask) - - -def crf_viterbi_accuracy(y_true, y_pred): - """ - Use Viterbi algorithm to get best path, and compute its accuracy. - `y_pred` must be an output from CRF. - """ - crf, idx = y_pred._keras_history[:2] - return crf.get_accuracy(y_true, y_pred) - - -def crf_marginal_accuracy(y_true, y_pred): - """ - Use time-wise marginal argmax as prediction. - `y_pred` must be an output from CRF with `learn_mode="marginal"`. 
- """ - crf, idx = y_pred._keras_history[:2] - X = crf._inbound_nodes[idx].input_tensors[0] - mask = crf._inbound_nodes[idx].input_masks[0] - y_pred = crf.get_marginal_prob(X, mask) - return _get_accuracy(y_true, y_pred, mask, crf.sparse_target) - - @keras_utils.register_keras_custom_object def crf_accuracy(y_true, y_pred): - # TODO: using tf 2.0 class based implementation - """ - Ge default accuracy based on CRF `test_mode`. - """ crf, idx = y_pred._keras_history[:2] - if crf.test_mode == 'viterbi': - return crf_viterbi_accuracy(y_true, y_pred) - else: - return crf_marginal_accuracy(y_true, y_pred) \ No newline at end of file + return crf.get_accuracy(y_true, y_pred) diff --git a/tensorflow_addons/metrics/marginal_acc.py b/tensorflow_addons/metrics/marginal_acc.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tensorflow_addons/metrics/viterbi_acc.py b/tensorflow_addons/metrics/viterbi_acc.py deleted file mode 100644 index e69de29bb2..0000000000 From 6b9d25201c750cbeeb9c50940273a8cb9581c7f5 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 4 Oct 2019 14:26:38 +0800 Subject: [PATCH 15/52] Update README --- tensorflow_addons/layers/README.md | 2 +- tensorflow_addons/losses/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/layers/README.md b/tensorflow_addons/layers/README.md index cbd4c6b9eb..02dbd787c0 100644 --- a/tensorflow_addons/layers/README.md +++ b/tensorflow_addons/layers/README.md @@ -23,7 +23,7 @@ | poincare | PoincareNormalize | https://arxiv.org/abs/1705.08039 | | sparsemax| Sparsemax | https://arxiv.org/abs/1602.02068 | | wrappers | WeightNormalization | https://arxiv.org/abs/1602.07868 | -| crf | CRF | https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers | +| crf | CRF | https://en.wikipedia.org/wiki/Conditional_random_field | ## Contribution Guidelines #### Standard API diff --git a/tensorflow_addons/losses/README.md b/tensorflow_addons/losses/README.md index c79599720e..fe8cc26a95 100644 --- a/tensorflow_addons/losses/README.md +++ b/tensorflow_addons/losses/README.md @@ -22,7 +22,7 @@ | npairs | NpairsMultilabelLoss | http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf | | sparsemax_loss | SparsemaxLoss | https://arxiv.org/abs/1602.02068 | | triplet | TripletSemiHardLoss | https://arxiv.org/abs/1503.03832 | -| crf | CRF | https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers | +| crf_loss | ConditionalRandomFieldLoss | https://en.wikipedia.org/wiki/Conditional_random_field | ## Contribution Guidelines From e961f08b112829ee2d4ff9757ffcf1348c834882 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 4 Oct 2019 20:44:55 +0800 Subject: [PATCH 16/52] Update code and BUILD: passed CI testing --- tensorflow_addons/layers/BUILD | 13 +++++ tensorflow_addons/layers/crf.py | 60 ++++++++++++++--------- tensorflow_addons/layers/crf_test.py | 27 +++++----- tensorflow_addons/losses/BUILD | 13 +++++ tensorflow_addons/metrics/crf_accuracy.py | 20 ++++++++ 5 files changed, 96 insertions(+), 37 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index c494b8793a..d43ee00451 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -103,3 +103,16 @@ py_test( ":layers", ], ) + +py_test( + name = "crf_test", + size = "small", + srcs = [ + "crf_test.py", + ], + main = "crf_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers", + ], +) diff 
--git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index a7e8bedbd9..3e47f04731 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -39,6 +39,7 @@ def __init__(self, units, use_boundary=False, use_bias=True, + use_kernel=True, activation="linear", kernel_initializer="glorot_uniform", chain_initializer="orthogonal", @@ -65,6 +66,7 @@ def __init__(self, self.use_boundary = use_boundary self.use_bias = use_bias + self.use_kernel = use_kernel self.activation = tf.keras.activations.get(activation) @@ -111,14 +113,15 @@ def build(self, input_shape): self.input_dim = input_shape[-1] - # weights that mapping arbitrary tensor to correct shape - self.kernel = self.add_weight( - shape=(self.input_dim, self.units), - name="kernel", - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - ) + if self.use_kernel: + # weights that mapping arbitrary tensor to correct shape + self.kernel = self.add_weight( + shape=(self.input_dim, self.units), + name="kernel", + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + ) # weights that work as transfer probability of each tags self.chain_kernel = self.add_weight( @@ -130,9 +133,9 @@ def build(self, input_shape): ) # bias that works with self.kernel - if self.use_bias: + if self.use_kernel and self.use_bias: self.bias = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="bias", initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -144,14 +147,14 @@ def build(self, input_shape): # weight of to tag probability and tag to probability if self.use_boundary: self.left_boundary = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="left_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, constraint=self.boundary_constraint, ) self.right_boundary = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="right_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, @@ -188,19 +191,19 @@ def call(self, inputs, mask=None, **kwargs): test_output = self.get_viterbi_decoding(logits, nwords) - # WHY: don't remove this line, useless but remote it will cause bug - test_output = tf.cast(test_output, tf.float32) + # WHY: don't remove this line, useless but remote it will cause bug, fix it later + # test_output = tf.cast(test_output, self.dtype or tf.keras.backend.floatx()) out = test_output return out - def _get_nwords(self, input, mask): + def _get_nwords(self, input_, mask): if mask is not None: int_mask = tf.keras.backend.cast(mask, tf.int8) nwords = self.mask_to_nwords(int_mask) else: # make a mask tensor from input, then used to generate nwords - input_energy_shape = tf.shape(input) + input_energy_shape = tf.shape(input_) raw_input_shape = tf.slice(input_energy_shape, [0], [2]) alt_mask = tf.ones(raw_input_shape) @@ -242,11 +245,10 @@ def expend_scalar_to_3d(x): [energy[:, :-1, :], energy[:, -1:, :] + end], axis=1) else: mask = tf.keras.backend.expand_dims( - tf.keras.backend.cast(mask, tf.keras.backend.floatx()), - axis=-1) + tf.keras.backend.cast(mask, start.dtype), axis=-1) start_mask = tf.keras.backend.cast( tf.keras.backend.greater(mask, self.shift_right(mask)), - tf.keras.backend.floatx(), + start.dtype, ) # original code: @@ -260,13 +262,14 @@ def expend_scalar_to_3d(x): # patch applied here. 
end_mask = tf.keras.backend.cast( tf.keras.backend.greater(mask, self.shift_left(mask)), - tf.keras.backend.floatx(), + end.dtype, ) energy = energy + start_mask * start energy = energy + end_mask * end return energy def get_viterbi_decoding(self, input_energy, nwords): + # pred_ids: A [batch_size, max_seq_len] matrix, with dtype `tf.int32` pred_ids, _ = crf_decode(input_energy, self.chain_kernel, nwords) return pred_ids @@ -280,6 +283,8 @@ def get_config(self): self.use_boundary, "use_bias": self.use_bias, + "use_kernel": + self.use_kernel, "kernel_initializer": tf.keras.initializers.serialize(self.kernel_initializer), "chain_initializer": @@ -316,7 +321,7 @@ def compute_output_shape(self, input_shape): output_shape = input_shape[:2] return output_shape - def compute_mask(self, input, mask=None): + def compute_mask(self, input_, mask=None): if mask is not None: # transform mask from shape (?, ?) to (?, ) new_mask = tf.keras.backend.any(mask, axis=1) @@ -358,5 +363,14 @@ def get_accuracy(self, y_true, y_pred): tf.keras.backend.sum(mask)) def _dense_layer(self, input_): - return self.activation( - tf.keras.backend.dot(input_, self.kernel) + self.bias) + if self.use_kernel: + output = self.activation( + tf.keras.backend.dot(input_, self.kernel) + self.bias) + else: + output = input_ + + return tf.keras.backend.cast(output, self.chain_kernel.dtype) + + @property + def _compute_dtype(self): + return tf.int32 diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index de37e74679..10959eb0b6 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -29,16 +29,16 @@ class TestCRF(tf.test.TestCase): def test_unmasked_viterbi_decode(self): x = np.array([ [ - # O B-X I-X B-Y I-Y - [0., 1., 0., 0., 0.], - [0., 0., 1., 0., 0.], - [0., 0., 1., 0., 0.] + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0] ], [ - # O B-X I-X B-Y I-Y - [0., 1., 0., 0., 0.], - [0., 1., 0., 0., 0.], - [0., 1., 0., 0., 0.] 
+ # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0] ] ]) # yapf: disable @@ -48,8 +48,7 @@ def test_unmasked_viterbi_decode(self): ]) # yapf: disable transitions = np.ones([5, 5]) - transitions_from_start = np.ones(5) - transitions_to_end = np.ones(5) + boundary_value = np.ones(5) test_utils.layer_test( CRF, @@ -62,13 +61,13 @@ def test_unmasked_viterbi_decode(self): tf.keras.initializers.Constant(transitions), "use_boundary": True, - "left_boundary_initializer": - tf.keras.initializers.Constant(transitions_from_start), - "right_boundary_initializer": - tf.keras.initializers.Constant(transitions_to_end), + "boundary_initializer": + tf.keras.initializers.Constant(boundary_value) }, input_data=x, expected_output=expected_y, + expected_output_dtype=tf.int32, + validate_training=False ) diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index 705340a902..c9a333d9eb 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -112,3 +112,16 @@ py_test( ":losses", ], ) + +py_test( + name = "crf_loss_test", + size = "small", + srcs = [ + "crf_loss.py", + ], + main = "crf_loss.py", + srcs_version = "PY2AND3", + deps = [ + ":losses", + ], +) diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py index e8d0c3b8b3..9f2f624d24 100644 --- a/tensorflow_addons/metrics/crf_accuracy.py +++ b/tensorflow_addons/metrics/crf_accuracy.py @@ -1,3 +1,23 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implements Accuracy for Conditional Random Field.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + from tensorflow_addons.utils import keras_utils From 18c569e35f2ff96a82b767520f91638b0ae59723 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 9 Oct 2019 20:22:23 +0800 Subject: [PATCH 17/52] Update code --- tensorflow_addons/layers/crf.py | 128 ++++++++++++++++++---- tensorflow_addons/layers/crf_test.py | 3 +- tensorflow_addons/losses/BUILD | 5 +- tensorflow_addons/losses/crf_loss.py | 17 +-- tensorflow_addons/losses/crf_loss_test.py | 91 +++++++++------ 5 files changed, 174 insertions(+), 70 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 3e47f04731..12c0f3a1a5 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -23,36 +23,111 @@ import tensorflow as tf from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood from tensorflow_addons.utils import keras_utils -""" -TODO -* decide input_dim should be keep or drop -* left padding of mask is not supported (future version should fix it) -* not test yet if CRF is the first layer -* Add docs -""" +# TODO +# +# * decide input_dim should be keep or drop +# * [future version should fix it] left padding of mask is not supported +# * not test yet if CRF is the first layer @keras_utils.register_keras_custom_object class CRF(tf.keras.layers.Layer): + """Linear chain conditional random field (CRF). + + Examples: + + ```python + from tensorflow_addons.layers import CRF + + model = Sequential() + model.add(Embedding(3001, 300, mask_zero=True)) + + crf = CRF(10, name='crf_layer') + model.add(crf) + + model.compile('adam', loss={'crf_layer': crf.loss}) + + model.fit(x, y) + ``` + + Arguments: + units: Positive integer, dimensionality of the output space; + should equal the number of tags. + chain_initializer: Initializer for the `chain_kernel` weights matrix, + used for the CRF chain energy. + (see [initializers](../initializers.md)). + chain_regularizer: Regularizer function applied to + the `chain_kernel` weights matrix. + chain_constraint: Constraint function applied to + the `chain_kernel` weights matrix. + use_boundary: Boolean (default True), indicating whether trainable + start-end chain energies should be added to the model. + boundary_initializer: Initializer for the `left_boundary` and + `right_boundary` weight vectors, + used for the start/left and end/right boundary energy. + boundary_regularizer: Regularizer function applied to + the `left_boundary` and `right_boundary` weight vectors. + boundary_constraint: Constraint function applied to + the `left_boundary` and `right_boundary` weight vectors. + use_kernel: Boolean (default True), indicating whether to apply + a fully connected layer before the CRF op. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + use_bias: Boolean (default True), whether the layer uses a bias vector. + bias_initializer: Initializer for the bias vector. + bias_regularizer: Regularizer function applied to the bias vector. + bias_constraint: Constraint function applied to the bias vector. + activation: default value is 'linear', Activation function to use.
+ input_dim: dimensionality of the input (integer). + This argument (or the keyword argument `input_shape`) + is required when using this layer as the first layer in a model. + + Input shape: + 3D tensor with shape: `(batch_size, sequence_length, feature_size)`. + + Output shape: + 2D tensor (dtype: int32) with shape: `(batch_size, sequence_length)`. + + Masking: + This layer supports masking + (2D tensor, shape: `(batch_size, sequence_length)`) + for input data with a variable number of timesteps. + This layer outputs the same mask tensor; + NOTICE this may cause issues when you + use Keras loss and metric functions, which usually expect a 1D mask. + + Loss function: + Because TF 2.0 enables eager execution by default, + the CRF loss cannot be implemented as an independent loss function. + Thus, users should use the loss method of this layer. + See Examples (above) for detailed usage. + + References: + - [Conditional Random Field](https://en.wikipedia.org/wiki/Conditional_random_field) + """ def __init__(self, units, - use_boundary=False, - use_bias=True, - use_kernel=True, - activation="linear", - kernel_initializer="glorot_uniform", chain_initializer="orthogonal", - bias_initializer="zeros", - boundary_initializer="zeros", - kernel_regularizer=None, chain_regularizer=None, - boundary_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, chain_constraint=None, + use_boundary=True, + boundary_initializer="zeros", + boundary_regularizer=None, boundary_constraint=None, + use_kernel=True, + kernel_initializer="glorot_uniform", + kernel_regularizer=None, + kernel_constraint=None, + use_bias=True, + bias_initializer="zeros", + bias_regularizer=None, bias_constraint=None, + activation="linear", input_dim=None, **kwargs): super(CRF, self).__init__(**kwargs) @@ -135,7 +210,7 @@ def build(self, input_shape): # bias that works with self.kernel if self.use_kernel and self.use_bias: self.bias = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="bias", initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -147,14 +222,14 @@ # weight of to tag probability and tag to probability if self.use_boundary: self.left_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="left_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, constraint=self.boundary_constraint, ) self.right_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="right_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, @@ -337,21 +412,26 @@ def get_decode_result(self, logits, mask): return pred_ids def get_negative_log_likelihood(self, y_true): - y_preds = self.logits + # Note: this y_pred is different from the y_pred of the loss function + y_pred = self.logits nwords = self.nwords - y_preds = tf.keras.backend.cast(y_preds, tf.float32) + y_pred = tf.keras.backend.cast(y_pred, tf.float32) y_true = tf.keras.backend.cast(y_true, tf.int32) nwords = tf.keras.backend.cast(nwords, tf.int32) self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, tf.float32) - log_likelihood, _ = crf_log_likelihood(y_preds, y_true, nwords, + log_likelihood, _ = crf_log_likelihood(y_pred, y_true, nwords, self.chain_kernel) return -log_likelihood + def loss(self, y_true, y_pred): + # we don't use y_pred, but caller pass it anyway + return self.get_negative_log_likelihood(y_true) + def get_accuracy(self, y_true, y_pred): judge = tf.keras.backend.cast(
tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index 10959eb0b6..be35a9c957 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -67,8 +67,7 @@ def test_unmasked_viterbi_decode(self): input_data=x, expected_output=expected_y, expected_output_dtype=tf.int32, - validate_training=False - ) + validate_training=False) if __name__ == "__main__": diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index c9a333d9eb..46f7019fb9 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -117,11 +117,12 @@ py_test( name = "crf_loss_test", size = "small", srcs = [ - "crf_loss.py", + "crf_loss_test.py", ], - main = "crf_loss.py", + main = "crf_loss_test.py", srcs_version = "PY2AND3", deps = [ ":losses", + "//tensorflow_addons/layers", ], ) diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py index 1dcdf95ce0..c1fa754c9d 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf_loss.py @@ -22,22 +22,17 @@ from tensorflow_addons.utils import keras_utils -@keras_utils.register_keras_custom_object -@tf.function -def crf_loss(y_true, y_pred): - crf, idx = y_pred._keras_history[:2] - - nloglik = crf.get_negative_log_likelihood(y_true) - - return nloglik - - @keras_utils.register_keras_custom_object class ConditionalRandomFieldLoss(object): + def __init__(self, keras_model, crf_layer_name): + self.keras_model = keras_model + self.crf_layer_name = crf_layer_name + def get_config(self): return {} def __call__(self, y_true, y_pred, sample_weight=None): - loss_vector = crf_loss(y_true, y_pred) + crf_layer = self.keras_model.get_layer(name=self.crf_layer_name) + loss_vector = crf_layer.loss(y_true, y_pred) return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index a80d055a4e..9c8689f727 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -31,7 +31,9 @@ @test_utils.run_all_in_graph_and_eager_modes class ConditionalRandomFieldLossTest(tf.test.TestCase): - def test_forward_works_without_mask(self): + def setUp(self): + super(ConditionalRandomFieldLossTest, self).setUp() + self.logits = np.array([ [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], @@ -46,8 +48,7 @@ def test_forward_works_without_mask(self): [1.0, 1.0, 1.0, 1.0, 1.0], ]) - self.transitions_from_start = np.array([0.1, 0.2, 0.3, 0.4, 0.6]) - self.transitions_to_end = np.array([-0.1, -0.2, 0.3, -0.4, -0.4]) + self.boundary_values = np.ones((5,)) # Use the CRF Module with fixed transitions to compute the log_likelihood self.crf = CRF( @@ -55,48 +56,76 @@ def test_forward_works_without_mask(self): use_kernel=False, # disable kernel transform chain_initializer=tf.keras.initializers.Constant(self.transitions), use_boundary=True, - left_boundary_initializer=tf.keras.initializers.Constant( - self.transitions_from_start), - right_boundary_initializer=tf.keras.initializers.Constant( - self.transitions_to_end), + boundary_initializer=tf.keras.initializers.Constant( + self.boundary_values), name="crf_layer", ) - crf_loss_instance = ConditionalRandomFieldLoss() + def score(self, logits, tags): + """Computes the likelihood score for the given sequence of tags, given + the provided logits 
(and the transition weights in the CRF model)""" + # Start with transitions from START and to END + total = self.boundary_values[tags[0]] + self.boundary_values[tags[-1]] + # Add in all the intermediate transitions + for tag, next_tag in zip(tags, tags[1:]): + total += self.transitions[tag, next_tag] + # Add in the logits for the observed tags + for logit, tag in zip(logits, tags): + total += logit[tag] + return total + + def compute_log_likelihood(self): + # Now compute the log-likelihood manually + manual_log_likelihood = 0.0 + + # For each instance, manually compute the numerator + # (which is just the score for the logits and actual tags) + # and the denominator + # (which is the log-sum-exp of the scores + # for the logits across all possible tags) + for logits_i, tags_i in zip(self.logits, self.tags): + numerator = self.score(logits_i, tags_i) + all_scores = [ + self.score(logits_i, tags_j) + for tags_j in itertools.product(range(5), repeat=3) + ] + denominator = math.log( + sum(math.exp(score) for score in all_scores)) + # And include them in the manual calculation. + manual_log_likelihood += numerator - denominator + + return manual_log_likelihood + + def test_loss_function_as_crf_method(self): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile("adam", loss={"crf_layer": self.crf.loss}) + model.summary() + + log_likelihood = model.train_on_batch(self.logits, self.tags) + + # The manually computed log likelihood should + # equal the result of crf.forward. + expected_log_likelihood = self.compute_log_likelihood() + unbatched_log_likelihood = -2 * log_likelihood + + self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) + + def test_loss_function_as_layer(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) + crf_loss_instance = ConditionalRandomFieldLoss(model, "crf_layer") model.compile("adam", loss={"crf_layer": crf_loss_instance}) model.summary() log_likelihood = model.train_on_batch(self.logits, self.tags) - def compute_log_likelihood(): - # Now compute the log-likelihood manually - manual_log_likelihood = 0.0 - - # For each instance, manually compute the numerator - # (which is just the score for the logits and actual tags) - # and the denominator - # (which is the log-sum-exp of the scores - # for the logits across all possible tags) - for logits_i, tags_i in zip(self.logits, self.tags): - numerator = self.score(logits_i, tags_i) - all_scores = [ - self.score(logits_i, tags_j) - for tags_j in itertools.product(range(5), repeat=3) - ] - denominator = math.log( - sum(math.exp(score) for score in all_scores)) - # And include them in the manual calculation. - manual_log_likelihood += numerator - denominator - - return manual_log_likelihood - # The manually computed log likelihood should # equal the result of crf.forward. 
- expected_log_likelihood = compute_log_likelihood() + expected_log_likelihood = self.compute_log_likelihood() unbatched_log_likelihood = -2 * log_likelihood self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) From 6a4c8fcee1890f02614cbe26ee69a974db01e8e3 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 9 Oct 2019 21:07:31 +0800 Subject: [PATCH 18/52] Remove input_dim --- tensorflow_addons/layers/crf.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 12c0f3a1a5..ca5714ab31 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -26,7 +26,6 @@ # TODO # -# * decide input_dim should be keep or drop # * [future version should fix it] left padding of mask is not supported # * not test yet if CRF is the first layer @@ -83,9 +82,6 @@ class CRF(tf.keras.layers.Layer): bias_regularizer: Regularizer function applied to the bias vector. bias_constraint: Constraint function applied to the bias vector. activation: default value is 'linear', Activation function to use. - input_dim: dimensionality of the input (integer). - This argument (or the keyword argument `input_shape`) - is required when using this layer as the first layer in a model. Input shape: 3D tensor with shape: `(batch_size, sequence_length, feature_size)`. @@ -128,7 +124,6 @@ def __init__(self, bias_regularizer=None, bias_constraint=None, activation="linear", - input_dim=None, **kwargs): super(CRF, self).__init__(**kwargs) @@ -163,8 +158,6 @@ def __init__(self, boundary_constraint) self.bias_constraint = tf.keras.constraints.get(bias_constraint) - self.input_dim = input_dim - # values will be assigned in method self.input_spec = None @@ -186,12 +179,12 @@ def build(self, input_shape): # see API docs of InputSpec for more detail self.input_spec = [tf.keras.layers.InputSpec(shape=input_shape)] - self.input_dim = input_shape[-1] + feature_size = input_shape[-1] if self.use_kernel: # weights that mapping arbitrary tensor to correct shape self.kernel = self.add_weight( - shape=(self.input_dim, self.units), + shape=(feature_size, self.units), name="kernel", initializer=self.kernel_initializer, regularizer=self.kernel_regularizer, @@ -385,9 +378,7 @@ def get_config(self): "boundary_constraint": tf.keras.constraints.serialize(self.boundary_constraint), "bias_constraint": - tf.keras.constraints.serialize(self.bias_constraint), - "input_dim": - self.input_dim, + tf.keras.constraints.serialize(self.bias_constraint) } base_config = super(CRF, self).get_config() return dict(list(base_config.items()) + list(config.items())) From 28790d90ecfb513fe0d0f59ccc931e13717f88f1 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Thu, 10 Oct 2019 19:39:42 +0800 Subject: [PATCH 19/52] Update: using hack code to make CRF work again in TF 2.0 --- tensorflow_addons/layers/__init__.py | 3 +- tensorflow_addons/layers/crf.py | 185 +++++++++++++--------- tensorflow_addons/losses/__init__.py | 1 + tensorflow_addons/losses/crf_loss.py | 6 +- tensorflow_addons/losses/crf_loss_test.py | 41 +++-- 5 files changed, 139 insertions(+), 97 deletions(-) diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index d527e16362..4f8c6585eb 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -25,4 +25,5 @@ from tensorflow_addons.layers.optical_flow import CorrelationCost from tensorflow_addons.layers.poincare import PoincareNormalize from 
tensorflow_addons.layers.sparsemax import Sparsemax -from tensorflow_addons.layers.wrappers import WeightNormalization \ No newline at end of file +from tensorflow_addons.layers.wrappers import WeightNormalization +from tensorflow_addons.layers.crf import CRF diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index ca5714ab31..503b3332f5 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -162,8 +162,8 @@ def __init__(self, self.input_spec = None # value remembered for loss/metrics function - self.logits = None - self.nwords = None + self.potentials = None + self.sequence_length = None self.mask = None # global variable @@ -233,7 +233,7 @@ def build(self, input_shape): super(CRF, self).build(input_shape) def call(self, inputs, mask=None, **kwargs): - # mask: Tensor(shape=(?, ?), dtype=bool) or None + # mask: Tensor(shape=(batch_size, sequence_length), dtype=bool) or None if mask is not None: assert (tf.keras.backend.ndim(mask) == 2 @@ -242,63 +242,96 @@ def call(self, inputs, mask=None, **kwargs): # remember this value for later use self.mask = mask - logits = self._dense_layer(inputs) + self.potentials = self._dense_layer(inputs) # appending boundary probability info if self.use_boundary: - logits = self.add_boundary_energy(logits, mask, self.left_boundary, - self.right_boundary) - - # remember this value for later use - self.logits = logits - - nwords = self._get_nwords(inputs, mask) + self.potentials = self.add_boundary_energy( + self.potentials, + mask, + self.left_boundary, + self.right_boundary + ) - # remember this value for later use - self.nwords = nwords + self.sequence_length = self._get_sequence_length(inputs, mask) - test_output = self.get_viterbi_decoding(logits, nwords) + decoded_sequence, _ = self.get_viterbi_decoding(self.potentials, self.sequence_length) - # WHY: don't remove this line, useless but remote it will cause bug, fix it later - # test_output = tf.cast(test_output, self.dtype or tf.keras.backend.floatx()) - out = test_output + return decoded_sequence - return out + def _get_sequence_length(self, input_, mask): + """ + Currently the underlying CRF functions (provided by tensorflow_addons.text.crf) + do not support bi-directional masking (left padding / right padding); + they support right padding by being told the sequence length. + + This function computes the sequence length from the input and the mask.
+ """ if mask is not None: int_mask = tf.keras.backend.cast(mask, tf.int8) - nwords = self.mask_to_nwords(int_mask) + sequence_length = self.mask_to_sequence_length(int_mask) else: - # make a mask tensor from input, then used to generate nwords + # make a mask tensor from input, then used to generate sequence_length input_energy_shape = tf.shape(input_) raw_input_shape = tf.slice(input_energy_shape, [0], [2]) alt_mask = tf.ones(raw_input_shape) - nwords = self.mask_to_nwords(alt_mask) + sequence_length = self.mask_to_sequence_length(alt_mask) - return nwords + return sequence_length - def mask_to_nwords(self, mask): - nwords = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) - return nwords + def mask_to_sequence_length(self, mask): + """ + compute sequence length from mask + """ + sequence_length = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) + return sequence_length @staticmethod - def shift_left(x, offset=1): - assert offset > 0 - return tf.keras.backend.concatenate( - [x[:, offset:], - tf.keras.backend.zeros_like(x[:, :offset])], + def _compute_mask_right_boundary(mask): + """ + input mask: 0011100, output left_boundary: 0000100 + """ + # shift mask to left by 1: 0011100 => 0111000 + offset = 1 + left_shifted_mask = tf.keras.backend.concatenate( + [mask[:, offset:], + tf.keras.backend.zeros_like(mask[:, :offset])], axis=1) + # TODO(howl-anderson): for below code + # Original code in keras_contrib: + # end_mask = K.cast( + # K.greater(self.shift_left(mask), mask), + # K.floatx() + # ) + # May have a bug, it's better confirmed + # by the original keras_contrib maintainer + # Luiz Felix (github: lzfelix), + # mailed him already and waiting for reply. + + # 0011100 > 0111000 => 0000100 + right_boundary = tf.keras.backend.greater(mask, left_shifted_mask) + + return right_boundary + @staticmethod - def shift_right(x, offset=1): - assert offset > 0 - return tf.keras.backend.concatenate( - [tf.keras.backend.zeros_like(x[:, :offset]), x[:, :-offset]], + def _compute_mask_left_boundary(mask): + """ + input mask: 0011100, output left_boundary: 0010000 + """ + # shift mask to right by 1: 0011100 => 0001110 + offset = 1 + right_shifted_mask = tf.keras.backend.concatenate( + [tf.keras.backend.zeros_like(mask[:, :offset]), mask[:, :-offset]], axis=1) - def add_boundary_energy(self, energy, mask, start, end): + # 0011100 > 0001110 => 0010000 + left_boundary = tf.keras.backend.greater(mask, right_shifted_mask) + + return left_boundary + + def add_boundary_energy(self, potentials, mask, start, end): def expend_scalar_to_3d(x): # expend tensor from shape (x, ) to (1, 1, x) return tf.keras.backend.expand_dims( @@ -307,40 +340,31 @@ def expend_scalar_to_3d(x): start = expend_scalar_to_3d(start) end = expend_scalar_to_3d(end) if mask is None: - energy = tf.keras.backend.concatenate( - [energy[:, :1, :] + start, energy[:, 1:, :]], axis=1) - energy = tf.keras.backend.concatenate( - [energy[:, :-1, :], energy[:, -1:, :] + end], axis=1) + potentials = tf.keras.backend.concatenate( + [potentials[:, :1, :] + start, potentials[:, 1:, :]], axis=1) + potentials = tf.keras.backend.concatenate( + [potentials[:, :-1, :], potentials[:, -1:, :] + end], axis=1) else: mask = tf.keras.backend.expand_dims( tf.keras.backend.cast(mask, start.dtype), axis=-1) start_mask = tf.keras.backend.cast( - tf.keras.backend.greater(mask, self.shift_right(mask)), + self._compute_mask_left_boundary(mask), start.dtype, ) - # original code: - # end_mask = K.cast( - # K.greater(self.shift_left(mask), mask), - # 
K.floatx() - # ) - # Note: original code should have a bug, - # need confirmed with @lzfelix (Luiz Felix), - # mailed to him but no reply for months, - # patch applied here. end_mask = tf.keras.backend.cast( - tf.keras.backend.greater(mask, self.shift_left(mask)), + self._compute_mask_right_boundary(mask), end.dtype, ) - energy = energy + start_mask * start - energy = energy + end_mask * end - return energy + potentials = potentials + start_mask * start + potentials = potentials + end_mask * end + return potentials - def get_viterbi_decoding(self, input_energy, nwords): - # pred_ids: A [batch_size, max_seq_len] matrix, with dtype `tf.int32` - pred_ids, _ = crf_decode(input_energy, self.chain_kernel, nwords) + def get_viterbi_decoding(self, potentials, sequence_length): + # decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32` + decode_tags, best_score = crf_decode(potentials, self.chain_kernel, sequence_length) - return pred_ids + return decode_tags, best_score def get_config(self): # used for loading model from disk @@ -388,6 +412,13 @@ def compute_output_shape(self, input_shape): return output_shape def compute_mask(self, input_, mask=None): + """ + Set the output mask to be a 1D tensor, so the loss method of this class can work without error. + But there is a big shortcoming: + layers, losses and metrics after this layer + cannot access a meaningful mask, which means they cannot work correctly. + Users can only get correct loss and metric values from the methods of this layer. + """ if mask is not None: # transform mask from shape (?, ?) to (?, ) new_mask = tf.keras.backend.any(mask, axis=1) @@ -395,35 +426,24 @@ def compute_mask(self, input_, mask=None): return mask - def get_decode_result(self, logits, mask): - nwords = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) - - pred_ids, _ = crf_decode(logits, self.chain_kernel, nwords) - - return pred_ids - def get_negative_log_likelihood(self, y_true): - # Note: this y_pred is different from the y_pred of the loss function - y_pred = self.logits - - nwords = self.nwords - - y_pred = tf.keras.backend.cast(y_pred, tf.float32) + # TODO: remove typing cast + self.potentials = tf.keras.backend.cast(self.potentials, tf.float32) y_true = tf.keras.backend.cast(y_true, tf.int32) - nwords = tf.keras.backend.cast(nwords, tf.int32) - self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, - tf.float32) + self.sequence_length = tf.keras.backend.cast(self.sequence_length, tf.int32) + # self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, + # tf.float32) - log_likelihood, _ = crf_log_likelihood(y_pred, y_true, nwords, + log_likelihood, _ = crf_log_likelihood(self.potentials, y_true, self.sequence_length, self.chain_kernel) return -log_likelihood def loss(self, y_true, y_pred): - # we don't use y_pred, but caller pass it anyway + # we don't use y_pred, but caller pass it anyway, ignore it return self.get_negative_log_likelihood(y_true) - def get_accuracy(self, y_true, y_pred): + def accuracy(self, y_true, y_pred): judge = tf.keras.backend.cast( tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) if self.mask is None: @@ -442,6 +462,17 @@ def _dense_layer(self, input_): return tf.keras.backend.cast(output, self.chain_kernel.dtype) + def __call__(self, inputs, *args, **kwargs): + outputs = super(CRF, self).__call__(inputs, *args, **kwargs) + + # A hack that adds _keras_history to EagerTensor, making it more like a normal Tensor + for tensor in tf.nest.flatten(outputs): + if not hasattr(tensor, '_keras_history'):
tensor._keras_history = (self, 0, 0) + + return outputs + @property def _compute_dtype(self): + # fixed output dtype from the underlying CRF functions return tf.int32 diff --git a/tensorflow_addons/losses/__init__.py b/tensorflow_addons/losses/__init__.py index ff8e5094fa..88cf658d7b 100644 --- a/tensorflow_addons/losses/__init__.py +++ b/tensorflow_addons/losses/__init__.py @@ -24,3 +24,4 @@ from tensorflow_addons.losses.npairs import npairs_loss, NpairsLoss, npairs_multilabel_loss, NpairsMultilabelLoss from tensorflow_addons.losses.sparsemax_loss import sparsemax_loss, SparsemaxLoss from tensorflow_addons.losses.triplet import triplet_semihard_loss, TripletSemiHardLoss +from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py index c1fa754c9d..ff4b4e1720 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf_loss.py @@ -24,15 +24,11 @@ @keras_utils.register_keras_custom_object class ConditionalRandomFieldLoss(object): - def __init__(self, keras_model, crf_layer_name): - self.keras_model = keras_model - self.crf_layer_name = crf_layer_name - def get_config(self): return {} def __call__(self, y_true, y_pred, sample_weight=None): - crf_layer = self.keras_model.get_layer(name=self.crf_layer_name) + crf_layer = y_pred._keras_history[0] loss_vector = crf_layer.loss(y_true, y_pred) return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index 9c8689f727..70473a4257 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -20,6 +20,7 @@ import itertools import math +import os import numpy as np import tensorflow as tf @@ -48,7 +49,7 @@ def setUp(self): [1.0, 1.0, 1.0, 1.0, 1.0], ]) - self.boundary_values = np.ones((5,)) + self.boundary_values = np.ones((5, )) # Use the CRF Module with fixed transitions to compute the log_likelihood self.crf = CRF( @@ -96,12 +97,12 @@ def compute_log_likelihood(self): return manual_log_likelihood - def test_loss_function_as_crf_method(self): + def test_loss_function(self): + model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile("adam", loss={"crf_layer": self.crf.loss}) - model.summary() + model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) - log_likelihood = model.train_on_batch(self.logits, self.tags) + log_likelihood = model.train_on_batch(self.logits, self.tags) @@ -112,23 +113,35 @@ def test_loss_function(self): self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) - def test_loss_function_as_layer(self): + def test_model_fit(self): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) + + model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + def test_dump_and_load(self): + MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5' model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - crf_loss_instance = ConditionalRandomFieldLoss(model, "crf_layer") - model.compile("adam", loss={"crf_layer": crf_loss_instance}) - model.summary() + model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) - log_likelihood = model.train_on_batch(self.logits, self.tags) + model.fit(self.logits, self.tags, epochs=10, batch_size=1) - # The manually computed log likelihood 
should - # equal the result of crf.forward. - expected_log_likelihood = self.compute_log_likelihood() - unbatched_log_likelihood = -2 * log_likelihood - - self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) + model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + model.save(MODEL_PERSISTENCE_PATH) + new_model = tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) + + new_model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) + + try: + os.remove(MODEL_PERSISTENCE_PATH) + except OSError: + pass if __name__ == "__main__": From 7040e3f94d1ec93537b26c8a1ff57d1ace379da1 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 11 Oct 2019 16:37:50 +0800 Subject: [PATCH 20/52] All code passed CI testing --- tensorflow_addons/layers/crf.py | 28 ++--- tensorflow_addons/losses/crf_loss.py | 2 +- tensorflow_addons/metrics/BUILD | 15 +++ tensorflow_addons/metrics/__init__.py | 1 + tensorflow_addons/metrics/crf_accuracy.py | 70 +++++++++++- .../metrics/crf_accuracy_test.py | 102 ++++++++++++++++++ 6 files changed, 200 insertions(+), 18 deletions(-) create mode 100644 tensorflow_addons/metrics/crf_accuracy_test.py diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 503b3332f5..49a5107586 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -26,7 +26,7 @@ # TODO # -# * [future version should fix it] left padding of mask is not supported +# * [future version should fix it] left padding of mask is not supported, detect it and report to user # * not test yet if CRF is the first layer @@ -412,17 +412,17 @@ def compute_output_shape(self, input_shape): return output_shape def compute_mask(self, input_, mask=None): - """ - Set the output mask to be a 1D tensor, so the loss method of this class can work without error. - But there is a big shortcoming: - layers, losses and metrics after this layer - cannot access a meaningful mask, which means they cannot work correctly. - Users can only get correct loss and metric values from the methods of this layer. - """ - if mask is not None: - # transform mask from shape (?, ?) to (?, ) - new_mask = tf.keras.backend.any(mask, axis=1) - return new_mask + # """ + # Set the output mask to be a 1D tensor, so the loss method of this class can work without error. + # But there is a big shortcoming: + # layers, losses and metrics after this layer + # cannot access a meaningful mask, which means they cannot work correctly. + # Users can only get correct loss and metric values from the methods of this layer. + # """ + # if mask is not None: + # # transform mask from shape (?, ?)
to (?, ) + # new_mask = tf.keras.backend.any(mask, axis=1) + # return new_mask return mask @@ -439,11 +439,11 @@ def get_negative_log_likelihood(self, y_true): return -log_likelihood - def loss(self, y_true, y_pred): + def get_loss(self, y_true, y_pred): # we don't use y_pred, but caller pass it anyway, ignore it return self.get_negative_log_likelihood(y_true) - def accuracy(self, y_true, y_pred): + def get_accuracy(self, y_true, y_pred): judge = tf.keras.backend.cast( tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) if self.mask is None: diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py index ff4b4e1720..6e9be1b7ae 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf_loss.py @@ -29,6 +29,6 @@ def get_config(self): def __call__(self, y_true, y_pred, sample_weight=None): crf_layer = y_pred._keras_history[0] - loss_vector = crf_layer.loss(y_true, y_pred) + loss_vector = crf_layer.get_loss(y_true, y_pred) return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/metrics/BUILD b/tensorflow_addons/metrics/BUILD index f1d32a6212..a0b0040998 100644 --- a/tensorflow_addons/metrics/BUILD +++ b/tensorflow_addons/metrics/BUILD @@ -83,3 +83,18 @@ py_test( ":metrics", ], ) + +py_test( + name = "crf_accuracy_test", + size = "small", + srcs = [ + "crf_accuracy_test.py", + ], + main = "crf_accuracy_test.py", + srcs_version = "PY2AND3", + deps = [ + ":metrics", + "//tensorflow_addons/layers", + "//tensorflow_addons/losses", + ], +) diff --git a/tensorflow_addons/metrics/__init__.py b/tensorflow_addons/metrics/__init__.py index 83da2da2a8..e46e691a56 100755 --- a/tensorflow_addons/metrics/__init__.py +++ b/tensorflow_addons/metrics/__init__.py @@ -22,3 +22,4 @@ from tensorflow_addons.metrics.f_scores import F1Score, FBetaScore from tensorflow_addons.metrics.r_square import RSquare from tensorflow_addons.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix +from tensorflow_addons.metrics.crf_accuracy import _crf_accuracy diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py index 9f2f624d24..4c246bfd4b 100644 --- a/tensorflow_addons/metrics/crf_accuracy.py +++ b/tensorflow_addons/metrics/crf_accuracy.py @@ -18,10 +18,74 @@ from __future__ import division from __future__ import print_function +import types + +import tensorflow as tf +from tensorflow.python.keras.engine.base_layer import Layer +from tensorflow.python.keras.utils import metrics_utils +from tensorflow.python.ops.losses import util as tf_losses_utils + from tensorflow_addons.utils import keras_utils +def _crf_accuracy(y_true, y_pred): + crf_layer = y_pred._keras_history[0] + return crf_layer.get_accuracy(y_true, y_pred) + + @keras_utils.register_keras_custom_object -def crf_accuracy(y_true, y_pred): - crf, idx = y_pred._keras_history[:2] - return crf.get_accuracy(y_true, y_pred) +class ConditionalRandomFieldAccuracy(tf.keras.metrics.Mean): + """Wraps a stateless metric function with the Mean metric.""" + + def __init__(self, name='crf_accuracy', dtype=None): + """Creates a `MeanMetricWrapper` instance. + + Args: + fn: The metric function to wrap, with signature + `fn(y_true, y_pred, **kwargs)`. + name: (Optional) string name of the metric instance. + dtype: (Optional) data type of the metric result. + **kwargs: The keyword arguments that are passed on to `fn`. 
+ """ + super(ConditionalRandomFieldAccuracy, self).__init__( + name=name, dtype=dtype) + + self._fn = _crf_accuracy + + def __new__(cls, *args, **kwargs): + obj = Layer.__new__(cls) + + # A hack here, origianl base class (tf.keras.metrics.Metric) + # will convert update_state using tf.function + # but which will cause problem related to _keras_history + update_state_fn = obj.update_state + + obj.update_state = types.MethodType( + metrics_utils.update_state_wrapper(update_state_fn), obj) + obj.result = types.MethodType( + metrics_utils.result_wrapper(obj.result), obj) + return obj + + def update_state(self, y_true, y_pred, sample_weight=None): + y_true = tf.dtypes.cast(y_true, self._dtype) + + # cast operation will drop _keras_history info, which is vital to this metrics + # so, store it and then restore it later + y_pred_keras_history = y_pred._keras_history + y_pred = tf.dtypes.cast(y_pred, self._dtype) + y_pred._keras_history = y_pred_keras_history + + [y_true, y_pred], sample_weight = \ + metrics_utils.ragged_assert_compatible_and_get_flat_values( + [y_true, y_pred], sample_weight) + y_pred, y_true = tf_losses_utils.squeeze_or_expand_dimensions( + y_pred, y_true) + + matches = self._fn(y_true, y_pred) + return super(ConditionalRandomFieldAccuracy, self).update_state( + matches, sample_weight=sample_weight) + + def get_config(self): + config = {} + base_config = super(ConditionalRandomFieldAccuracy, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow_addons/metrics/crf_accuracy_test.py b/tensorflow_addons/metrics/crf_accuracy_test.py new file mode 100644 index 0000000000..6eb9c2764b --- /dev/null +++ b/tensorflow_addons/metrics/crf_accuracy_test.py @@ -0,0 +1,102 @@ +## Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Conditional Random Field loss.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import numpy as np +import tensorflow as tf + +from tensorflow_addons.layers.crf import CRF +from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss +from tensorflow_addons.metrics.crf_accuracy import ConditionalRandomFieldAccuracy +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class ConditionalRandomFieldAccuracyTest(tf.test.TestCase): + def setUp(self): + super(ConditionalRandomFieldAccuracyTest, self).setUp() + + self.logits = np.array([ + [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], + [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], + ]) + self.tags = np.array([[2, 3, 4], [3, 2, 2]]) + + self.transitions = np.array([ + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.8, 0.3, 0.1, 0.7, 0.9], + [-0.3, 2.1, -5.6, 3.4, 4.0], + [0.2, 0.4, 0.6, -0.3, -0.4], + [1.0, 1.0, 1.0, 1.0, 1.0], + ]) + + self.boundary_values = np.ones((5, )) + + # Use the CRF Module with fixed transitions to compute the log_likelihood + self.crf = CRF( + units=5, + use_kernel=False, # disable kernel transform + chain_initializer=tf.keras.initializers.Constant(self.transitions), + use_boundary=True, + boundary_initializer=tf.keras.initializers.Constant( + self.boundary_values), + name="crf_layer", + ) + + def test_model_fit(self): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile( + "adam", + loss={"crf_layer": ConditionalRandomFieldLoss()}, + metrics=[ConditionalRandomFieldAccuracy()]) + + model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + def test_dump_and_load(self): + MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5' + + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile( + "adam", + loss={"crf_layer": ConditionalRandomFieldLoss()}, + metrics=[ConditionalRandomFieldAccuracy()]) + + model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + model.save(MODEL_PERSISTENCE_PATH) + new_model = tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) + + new_model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) + + try: + os.remove(MODEL_PERSISTENCE_PATH) + except OSError: + pass + + +if __name__ == "__main__": + tf.test.main() From 7822d2000a04cb35aba0c2d81e252732fe43c442 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 11 Oct 2019 20:14:42 +0800 Subject: [PATCH 21/52] Update --- tensorflow_addons/layers/crf.py | 14 ++- tensorflow_addons/layers/crf_test.py | 6 ++ tensorflow_addons/losses/crf_loss.py | 8 ++ tensorflow_addons/losses/crf_loss_test.py | 19 +++- tensorflow_addons/metrics/BUILD | 16 --- tensorflow_addons/metrics/__init__.py | 1 - tensorflow_addons/metrics/crf_accuracy.py | 91 ---------------- .../metrics/crf_accuracy_test.py | 102 ------------------ 8 files changed, 38 insertions(+), 219 deletions(-) delete mode 100644 tensorflow_addons/metrics/crf_accuracy.py delete mode 100644 tensorflow_addons/metrics/crf_accuracy_test.py diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 49a5107586..d5bd5c48b3 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -24,11 +24,6 @@ from 
tensorflow_addons.text.crf import crf_decode, crf_log_likelihood from tensorflow_addons.utils import keras_utils -# TODO -# -# * [future version should fix it] left padding of mask is not supported, detect it and report to user -# * not test yet if CRF is the first layer @keras_utils.register_keras_custom_object class CRF(tf.keras.layers.Layer): @@ -239,6 +234,15 @@ def call(self, inputs, mask=None, **kwargs): assert (tf.keras.backend.ndim(mask) == 2 ), "Input mask to CRF must have dim 2 if not None" + # left padding of mask is not supported, due to the underlying CRF function + # detect it and report it to the user + # if mask is not None: + # left_boundary_mask = self._compute_mask_left_boundary(mask) + # first_mask = left_boundary_mask[0] + # + # if not first_mask: + # raise ValueError("Currently, CRF layer don't support left padding") + # remember this value for later use self.mask = mask diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index be35a9c957..703add0b15 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -26,6 +26,12 @@ @test_utils.run_all_in_graph_and_eager_modes class TestCRF(tf.test.TestCase): + # def test_left_padding(self): + # test_utils.layer_test( + # CRF, + # kwargs={"units": 4, "mask": np.array([0, 1, 1])}, + # validate_training=False) + def test_unmasked_viterbi_decode(self): x = np.array([ [ diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py index 6e9be1b7ae..51cf9d9e60 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf_loss.py @@ -19,6 +19,8 @@ from __future__ import print_function import tensorflow as tf + +from tensorflow_addons.layers import CRF from tensorflow_addons.utils import keras_utils @@ -29,6 +31,12 @@ def get_config(self): def __call__(self, y_true, y_pred, sample_weight=None): crf_layer = y_pred._keras_history[0] + + # check if last layer is CRF + if not isinstance(crf_layer, CRF): + raise ValueError('Last layer must be CRF to use {}.'.format( + self.__class__.__name__)) + loss_vector = crf_layer.get_loss(y_true, y_pred) return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index 70473a4257..9b34061af1 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -29,6 +29,8 @@ from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss from tensorflow_addons.utils import test_utils +# TODO(howl-anderson): test CRF as the first layer + @test_utils.run_all_in_graph_and_eager_modes class ConditionalRandomFieldLossTest(tf.test.TestCase): @@ -102,9 +104,12 @@ def test_loss_function(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) + model.compile( + "adam", + loss={"crf_layer": ConditionalRandomFieldLoss()}, + metrics=[tf.keras.metrics.Accuracy()]) - log_likelihood = model.train_on_batch(self.logits, self.tags) + log_likelihood, _ = model.train_on_batch(self.logits, self.tags) # The manually computed log likelihood should # equal the result of crf.forward.
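# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the brute-force check performed by
# `compute_log_likelihood` in the test above: score every possible tag
# sequence, then
#   log_likelihood = score(true tags) - log(sum(exp(score) over all sequences))
# Shapes and values below are assumed toy data (3 timesteps, 5 tags); this is
# an editor's illustration, not code taken from the patch itself.
import itertools
import math

import numpy as np

logits = np.random.rand(3, 5)        # unary scores, shape [seq_len, num_tags]
transitions = np.random.rand(5, 5)   # chain energies, shape [num_tags, num_tags]
boundary = np.ones(5)                # start/end boundary energies
tags = [2, 3, 4]                     # the "true" tag sequence

def seq_score(tag_seq):
    # boundary energy of the first and last tag
    total = boundary[tag_seq[0]] + boundary[tag_seq[-1]]
    # transition energy between consecutive tags
    total += sum(transitions[a, b] for a, b in zip(tag_seq, tag_seq[1:]))
    # unary energy of the chosen tag at each timestep
    total += sum(logits[i, t] for i, t in enumerate(tag_seq))
    return total

all_scores = [seq_score(s) for s in itertools.product(range(5), repeat=3)]
log_z = math.log(sum(math.exp(s) for s in all_scores))
log_likelihood = seq_score(tags) - log_z
# ---------------------------------------------------------------------------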
@@ -117,7 +122,10 @@ def test_model_fit(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) + model.compile( + "adam", + loss={"crf_layer": ConditionalRandomFieldLoss()}, + metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -127,7 +135,10 @@ def test_dump_and_load(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile("adam", loss={"crf_layer": ConditionalRandomFieldLoss()}) + model.compile( + "adam", + loss={"crf_layer": ConditionalRandomFieldLoss()}, + metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) diff --git a/tensorflow_addons/metrics/BUILD b/tensorflow_addons/metrics/BUILD index a0b0040998..33db5e905f 100644 --- a/tensorflow_addons/metrics/BUILD +++ b/tensorflow_addons/metrics/BUILD @@ -7,7 +7,6 @@ py_library( srcs = [ "__init__.py", "cohens_kappa.py", - "crf_accuracy.py", "f_scores.py", "multilabel_confusion_matrix.py", "r_square.py", @@ -83,18 +82,3 @@ py_test( ":metrics", ], ) - -py_test( - name = "crf_accuracy_test", - size = "small", - srcs = [ - "crf_accuracy_test.py", - ], - main = "crf_accuracy_test.py", - srcs_version = "PY2AND3", - deps = [ - ":metrics", - "//tensorflow_addons/layers", - "//tensorflow_addons/losses", - ], -) diff --git a/tensorflow_addons/metrics/__init__.py b/tensorflow_addons/metrics/__init__.py index e46e691a56..83da2da2a8 100755 --- a/tensorflow_addons/metrics/__init__.py +++ b/tensorflow_addons/metrics/__init__.py @@ -22,4 +22,3 @@ from tensorflow_addons.metrics.f_scores import F1Score, FBetaScore from tensorflow_addons.metrics.r_square import RSquare from tensorflow_addons.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix -from tensorflow_addons.metrics.crf_accuracy import _crf_accuracy diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py deleted file mode 100644 index 4c246bfd4b..0000000000 --- a/tensorflow_addons/metrics/crf_accuracy.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Implements Accuracy for Conditional Random Field.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import types - -import tensorflow as tf -from tensorflow.python.keras.engine.base_layer import Layer -from tensorflow.python.keras.utils import metrics_utils -from tensorflow.python.ops.losses import util as tf_losses_utils - -from tensorflow_addons.utils import keras_utils - - -def _crf_accuracy(y_true, y_pred): - crf_layer = y_pred._keras_history[0] - return crf_layer.get_accuracy(y_true, y_pred) - - -@keras_utils.register_keras_custom_object -class ConditionalRandomFieldAccuracy(tf.keras.metrics.Mean): - """Wraps a stateless metric function with the Mean metric.""" - - def __init__(self, name='crf_accuracy', dtype=None): - """Creates a `MeanMetricWrapper` instance. - - Args: - fn: The metric function to wrap, with signature - `fn(y_true, y_pred, **kwargs)`. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - **kwargs: The keyword arguments that are passed on to `fn`. - """ - super(ConditionalRandomFieldAccuracy, self).__init__( - name=name, dtype=dtype) - - self._fn = _crf_accuracy - - def __new__(cls, *args, **kwargs): - obj = Layer.__new__(cls) - - # A hack here, origianl base class (tf.keras.metrics.Metric) - # will convert update_state using tf.function - # but which will cause problem related to _keras_history - update_state_fn = obj.update_state - - obj.update_state = types.MethodType( - metrics_utils.update_state_wrapper(update_state_fn), obj) - obj.result = types.MethodType( - metrics_utils.result_wrapper(obj.result), obj) - return obj - - def update_state(self, y_true, y_pred, sample_weight=None): - y_true = tf.dtypes.cast(y_true, self._dtype) - - # cast operation will drop _keras_history info, which is vital to this metrics - # so, store it and then restore it later - y_pred_keras_history = y_pred._keras_history - y_pred = tf.dtypes.cast(y_pred, self._dtype) - y_pred._keras_history = y_pred_keras_history - - [y_true, y_pred], sample_weight = \ - metrics_utils.ragged_assert_compatible_and_get_flat_values( - [y_true, y_pred], sample_weight) - y_pred, y_true = tf_losses_utils.squeeze_or_expand_dimensions( - y_pred, y_true) - - matches = self._fn(y_true, y_pred) - return super(ConditionalRandomFieldAccuracy, self).update_state( - matches, sample_weight=sample_weight) - - def get_config(self): - config = {} - base_config = super(ConditionalRandomFieldAccuracy, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow_addons/metrics/crf_accuracy_test.py b/tensorflow_addons/metrics/crf_accuracy_test.py deleted file mode 100644 index 6eb9c2764b..0000000000 --- a/tensorflow_addons/metrics/crf_accuracy_test.py +++ /dev/null @@ -1,102 +0,0 @@ -## Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Conditional Random Field loss.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -import tensorflow as tf - -from tensorflow_addons.layers.crf import CRF -from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss -from tensorflow_addons.metrics.crf_accuracy import ConditionalRandomFieldAccuracy -from tensorflow_addons.utils import test_utils - - -@test_utils.run_all_in_graph_and_eager_modes -class ConditionalRandomFieldAccuracyTest(tf.test.TestCase): - def setUp(self): - super(ConditionalRandomFieldAccuracyTest, self).setUp() - - self.logits = np.array([ - [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], - [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], - ]) - self.tags = np.array([[2, 3, 4], [3, 2, 2]]) - - self.transitions = np.array([ - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.8, 0.3, 0.1, 0.7, 0.9], - [-0.3, 2.1, -5.6, 3.4, 4.0], - [0.2, 0.4, 0.6, -0.3, -0.4], - [1.0, 1.0, 1.0, 1.0, 1.0], - ]) - - self.boundary_values = np.ones((5, )) - - # Use the CRF Module with fixed transitions to compute the log_likelihood - self.crf = CRF( - units=5, - use_kernel=False, # disable kernel transform - chain_initializer=tf.keras.initializers.Constant(self.transitions), - use_boundary=True, - boundary_initializer=tf.keras.initializers.Constant( - self.boundary_values), - name="crf_layer", - ) - - def test_model_fit(self): - model = tf.keras.models.Sequential() - model.add(tf.keras.layers.Input(shape=(3, 5))) - model.add(self.crf) - model.compile( - "adam", - loss={"crf_layer": ConditionalRandomFieldLoss()}, - metrics=[ConditionalRandomFieldAccuracy()]) - - model.fit(self.logits, self.tags, epochs=10, batch_size=1) - - def test_dump_and_load(self): - MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5' - - model = tf.keras.models.Sequential() - model.add(tf.keras.layers.Input(shape=(3, 5))) - model.add(self.crf) - model.compile( - "adam", - loss={"crf_layer": ConditionalRandomFieldLoss()}, - metrics=[ConditionalRandomFieldAccuracy()]) - - model.fit(self.logits, self.tags, epochs=10, batch_size=1) - - model.save(MODEL_PERSISTENCE_PATH) - new_model = tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) - - new_model.fit(self.logits, self.tags, epochs=10, batch_size=1) - - tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) - - try: - os.remove(MODEL_PERSISTENCE_PATH) - except OSError: - pass - - -if __name__ == "__main__": - tf.test.main() From d8d2157874d5c074275da30d1ed9f3bf433346b1 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Fri, 11 Oct 2019 20:52:57 +0800 Subject: [PATCH 22/52] Update --- tensorflow_addons/losses/crf_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf_loss.py index 51cf9d9e60..9e119922e6 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf_loss.py @@ -20,7 +20,7 @@ import tensorflow as tf -from tensorflow_addons.layers import CRF +from tensorflow_addons.layers.crf import CRF from tensorflow_addons.utils import keras_utils From 6ca42158616db6687c196ca78bedff7027568081 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 14 Oct 2019 10:42:22 +0800 Subject: [PATCH 23/52] Add left padding detection function; untested due
to building failed --- tensorflow_addons/layers/crf.py | 51 +++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index d5bd5c48b3..df1c204586 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -101,6 +101,7 @@ class CRF(tf.keras.layers.Layer): References: - [Conditional Random Field](https://en.wikipedia.org/wiki/Conditional_random_field) """ + def __init__(self, units, chain_initializer="orthogonal", @@ -198,7 +199,7 @@ def build(self, input_shape): # bias that works with self.kernel if self.use_kernel and self.use_bias: self.bias = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="bias", initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -210,14 +211,14 @@ def build(self, input_shape): # weight of to tag probability and tag to probability if self.use_boundary: self.left_boundary = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="left_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, constraint=self.boundary_constraint, ) self.right_boundary = self.add_weight( - shape=(self.units,), + shape=(self.units, ), name="right_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, @@ -236,30 +237,35 @@ def call(self, inputs, mask=None, **kwargs): # left padding of mask is not supported, due the underline CRF function # detect it and report it to user - # if mask is not None: - # left_boundary_mask = self._compute_mask_left_boundary(mask) - # first_mask = left_boundary_mask[0] - # - # if not first_mask: - # raise ValueError("Currently, CRF layer don't support left padding") + first_mask = None + if mask is not None: + left_boundary_mask = self._compute_mask_left_boundary(mask) + first_mask = left_boundary_mask[0] # remember this value for later use self.mask = mask - self.potentials = self._dense_layer(inputs) + if first_mask is not None: + with tf.control_dependencies([ + tf.debugging.assert_equal( + first_mask, + tf.constant(1), + message= + "Currently, CRF layer don't support left padding") + ]): + self.potentials = self._dense_layer(inputs) + else: + self.potentials = self._dense_layer(inputs) # appending boundary probability info if self.use_boundary: self.potentials = self.add_boundary_energy( - self.potentials, - mask, - self.left_boundary, - self.right_boundary - ) + self.potentials, mask, self.left_boundary, self.right_boundary) self.sequence_length = self._get_sequence_length(inputs, mask) - decoded_sequence, _ = self.get_viterbi_decoding(self.potentials, self.sequence_length) + decoded_sequence, _ = self.get_viterbi_decoding( + self.potentials, self.sequence_length) return decoded_sequence @@ -288,7 +294,8 @@ def mask_to_sequence_length(self, mask): """ compute sequence length from mask """ - sequence_length = tf.keras.backend.cast(tf.keras.backend.sum(mask, 1), tf.int64) + sequence_length = tf.keras.backend.cast( + tf.keras.backend.sum(mask, 1), tf.int64) return sequence_length @staticmethod @@ -366,7 +373,8 @@ def expend_scalar_to_3d(x): def get_viterbi_decoding(self, potentials, sequence_length): # decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32` - decode_tags, best_score = crf_decode(potentials, self.chain_kernel, sequence_length) + decode_tags, best_score = crf_decode(potentials, self.chain_kernel, + sequence_length) return decode_tags, best_score @@ -434,12 +442,13 @@ def 
get_negative_log_likelihood(self, y_true): # TODO: remove typing cast self.potentials = tf.keras.backend.cast(self.potentials, tf.float32) y_true = tf.keras.backend.cast(y_true, tf.int32) - self.sequence_length = tf.keras.backend.cast(self.sequence_length, tf.int32) + self.sequence_length = tf.keras.backend.cast(self.sequence_length, + tf.int32) # self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, # tf.float32) - log_likelihood, _ = crf_log_likelihood(self.potentials, y_true, self.sequence_length, - self.chain_kernel) + log_likelihood, _ = crf_log_likelihood( + self.potentials, y_true, self.sequence_length, self.chain_kernel) return -log_likelihood From e6b48e151cf052b807e53980b95926a4bbff5659 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 14 Oct 2019 17:06:33 +0800 Subject: [PATCH 24/52] Update: passed all the test --- tensorflow_addons/layers/crf.py | 10 +-- tensorflow_addons/layers/crf_test.py | 57 +++++++--------- tensorflow_addons/losses/BUILD | 1 + tensorflow_addons/losses/crf_loss_test.py | 80 +++++++++++++++++++++-- 4 files changed, 106 insertions(+), 42 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index df1c204586..3ac5c8684b 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -240,7 +240,7 @@ def call(self, inputs, mask=None, **kwargs): first_mask = None if mask is not None: left_boundary_mask = self._compute_mask_left_boundary(mask) - first_mask = left_boundary_mask[0] + first_mask = left_boundary_mask[:, 0] # remember this value for later use self.mask = mask @@ -248,10 +248,10 @@ def call(self, inputs, mask=None, **kwargs): if first_mask is not None: with tf.control_dependencies([ tf.debugging.assert_equal( - first_mask, - tf.constant(1), - message= - "Currently, CRF layer don't support left padding") + tf.math.reduce_all(first_mask), + tf.constant(True), + message="Currently, CRF layer do not support left padding" + ) ]): self.potentials = self._dense_layer(inputs) else: diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index 703add0b15..c9b9af761e 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -26,32 +26,27 @@ @test_utils.run_all_in_graph_and_eager_modes class TestCRF(tf.test.TestCase): - # def test_left_padding(self): - # test_utils.layer_test( - # CRF, - # kwargs={"units": 4, "mask": np.array([0, 1, 1])}, - # validate_training=False) - def test_unmasked_viterbi_decode(self): - x = np.array([ - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0] - ], + x = np.array( [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0] + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], ] - ]) # yapf: disable + ) # yapf: disable - expected_y = np.array([ - [1, 2, 2], # B-X I-X I-X - [1, 1, 1] # B-X B-X B-X - ]) # yapf: disable + expected_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable transitions = np.ones([5, 5]) boundary_value = np.ones(5) @@ -59,21 +54,17 @@ def test_unmasked_viterbi_decode(self): test_utils.layer_test( CRF, kwargs={ - "units": - 5, - "use_kernel": - False, # disable kernel transform - "chain_initializer": - tf.keras.initializers.Constant(transitions), - 
"use_boundary": - True, - "boundary_initializer": - tf.keras.initializers.Constant(boundary_value) + "units": 5, + "use_kernel": False, # disable kernel transform + "chain_initializer": tf.keras.initializers.Constant(transitions), + "use_boundary": True, + "boundary_initializer": tf.keras.initializers.Constant(boundary_value), }, input_data=x, expected_output=expected_y, expected_output_dtype=tf.int32, - validate_training=False) + validate_training=False, + ) if __name__ == "__main__": diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index 46f7019fb9..e564e595ee 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -18,6 +18,7 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow_addons/activations", + "//tensorflow_addons/layers", "//tensorflow_addons/utils", ], ) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index 9b34061af1..f6fcaa3e33 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -26,7 +26,7 @@ import tensorflow as tf from tensorflow_addons.layers.crf import CRF -from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss +from tensorflow_addons.losses import crf_loss from tensorflow_addons.utils import test_utils # TODO(howl-anderson): test CRF as the first layer @@ -106,7 +106,7 @@ def test_loss_function(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": ConditionalRandomFieldLoss()}, + loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, metrics=[tf.keras.metrics.Accuracy()]) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -124,7 +124,7 @@ def test_model_fit(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": ConditionalRandomFieldLoss()}, + loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -137,7 +137,7 @@ def test_dump_and_load(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": ConditionalRandomFieldLoss()}, + loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -154,6 +154,78 @@ def test_dump_and_load(self): except OSError: pass + def test_mask_left_padding(self): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], + ] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + mask = np.array([[0, 1, 1], [1, 1, 1]]) + + layer = CRF(5) + + x = tf.keras.layers.Input(shape=(3, 5)) + y = layer(x, mask=tf.constant(mask)) + + # check shape inference + model = tf.keras.models.Model(x, y) + model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + + with self.assertRaises(tf.errors.InvalidArgumentError) as context: + model.fit(train_x, train_y) + + self.assertTrue("CRF layer do not support left padding" in context.exception.message) + + def test_mask_right_padding(self): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], + 
] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + mask = np.array([[1, 1, 1], [1, 1, 0]]) + + layer = CRF(5) + + x = tf.keras.layers.Input(shape=(3, 5)) + y = layer(x, mask=tf.constant(mask)) + + # check shape inference + model = tf.keras.models.Model(x, y) + model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + model.fit(train_x, train_y) + if __name__ == "__main__": tf.test.main() From 6f6fe9e582f2090efab4646a883674ec5aef5f38 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 14 Oct 2019 17:22:27 +0800 Subject: [PATCH 25/52] Update: tiny improvement --- tensorflow_addons/layers/README.md | 4 ++-- tensorflow_addons/losses/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/README.md b/tensorflow_addons/layers/README.md index 6ef28d29d6..216e356303 100644 --- a/tensorflow_addons/layers/README.md +++ b/tensorflow_addons/layers/README.md @@ -3,6 +3,7 @@ ## Maintainers | Submodule | Maintainers | Contact Info | |:---------- |:----------- |:------------- | +| crf | @howl-anderson | u1mail2me@gmail.com | | gelu | @AakashKumarNain | aakashnain@outlook.com | | maxout | @fsx950223 | fsx950223@gmail.com | | normalizations | @smokrow | moritz.kroeger@tu-dortmund.de | @@ -10,11 +11,11 @@ | poincare | | | | sparsemax | @AndreasMadsen | amwwebdk+github@gmail.com | | wrappers | @seanpmorgan | seanmorgan@outlook.com | -| crf | @howl-anderson | u1mail2me@gmail.com | ## Components | Submodule | Layer | Reference | |:---------- |:----------- |:------------- | +| crf | CRF | https://en.wikipedia.org/wiki/Conditional_random_field | | gelu | GeLU | https://arxiv.org/abs/1606.08415 | | maxout | Maxout | https://arxiv.org/abs/1302.4389 | | normalizations | GroupNormalization | https://arxiv.org/abs/1803.08494 | @@ -23,7 +24,6 @@ | poincare | PoincareNormalize | https://arxiv.org/abs/1705.08039 | | sparsemax| Sparsemax | https://arxiv.org/abs/1602.02068 | | wrappers | WeightNormalization | https://arxiv.org/abs/1602.07868 | -| crf | CRF | https://en.wikipedia.org/wiki/Conditional_random_field | ## Contribution Guidelines #### Standard API diff --git a/tensorflow_addons/losses/README.md b/tensorflow_addons/losses/README.md index 23ba8fafbf..876c0f3e86 100644 --- a/tensorflow_addons/losses/README.md +++ b/tensorflow_addons/losses/README.md @@ -4,25 +4,25 @@ | Submodule | Maintainers | Contact Info | |:---------- |:----------- |:------------- | | contrastive | @WindQAQ | windqaq@gmail.com | +| crf | @howl-anderson | u1mail2me@gmail.com | | focal_loss | @SSaishruthi | saishruthi.tn@gmail.com | | lifted | | | | npairs | @WindQAQ | windqaq@gmail.com | | sparsemax_loss | @AndreasMadsen | amwwebdk+github@gmail.com | | triplet | | | -| crf | @howl-anderson | u1mail2me@gmail.com | ## Components | Submodule | Loss | Reference | |:----------------------- |:---------------------|:--------------------------| | contrastive | ContrastiveLoss | http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf | +| crf_loss | ConditionalRandomFieldLoss | https://en.wikipedia.org/wiki/Conditional_random_field | | focal_loss | SigmoidFocalCrossEntropy | https://arxiv.org/abs/1708.02002 | | lifted | LiftedStructLoss | https://arxiv.org/abs/1511.06452 | | npairs | NpairsLoss | http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf | | npairs | NpairsMultilabelLoss | 
http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf | | sparsemax_loss | SparsemaxLoss | https://arxiv.org/abs/1602.02068 | | triplet | TripletSemiHardLoss | https://arxiv.org/abs/1503.03832 | -| crf_loss | ConditionalRandomFieldLoss | https://en.wikipedia.org/wiki/Conditional_random_field | ## Contribution Guidelines From 68a61324f333286beace4480bd2afeb53179e713 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 15 Oct 2019 14:25:13 +0800 Subject: [PATCH 26/52] Tiny improvement according to github user @OttoseiAza --- tensorflow_addons/layers/crf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 3ac5c8684b..65bb459658 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -278,8 +278,7 @@ def _get_sequence_length(self, input_, mask): this function is compute the sequence length from input and mask. """ if mask is not None: - int_mask = tf.keras.backend.cast(mask, tf.int8) - sequence_length = self.mask_to_sequence_length(int_mask) + sequence_length = self.mask_to_sequence_length(mask) else: # make a mask tensor from input, then used to generate sequence_length input_energy_shape = tf.shape(input_) From 75603b31195ad139af4c457552e7483f9b433016 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 15 Oct 2019 14:25:45 +0800 Subject: [PATCH 27/52] Add more metrics in test --- tensorflow_addons/losses/crf_loss_test.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index f6fcaa3e33..4318c086a0 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -107,7 +107,11 @@ def test_loss_function(self): model.compile( "adam", loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[tf.keras.metrics.Accuracy()]) + metrics=[ + tf.keras.metrics.Accuracy(), + tf.keras.metrics.Precision(), + tf.keras.metrics.Recall() + ]) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -125,7 +129,11 @@ def test_model_fit(self): model.compile( "adam", loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[tf.keras.metrics.Accuracy()]) + metrics=[ + tf.keras.metrics.Accuracy(), + tf.keras.metrics.Precision(), + tf.keras.metrics.Recall() + ]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -138,7 +146,11 @@ def test_dump_and_load(self): model.compile( "adam", loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[tf.keras.metrics.Accuracy()]) + metrics=[ + tf.keras.metrics.Accuracy(), + tf.keras.metrics.Precision(), + tf.keras.metrics.Recall() + ]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -190,7 +202,8 @@ def test_mask_left_padding(self): with self.assertRaises(tf.errors.InvalidArgumentError) as context: model.fit(train_x, train_y) - self.assertTrue("CRF layer do not support left padding" in context.exception.message) + self.assertTrue("CRF layer do not support left padding" in + context.exception.message) def test_mask_right_padding(self): train_x = np.array( From 5960add8aad3ae8eb77e619b8c4622611f2e6c6d Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 15 Oct 2019 17:48:38 +0800 Subject: [PATCH 28/52] fix code style --- tensorflow_addons/layers/crf.py | 41 ++++++++++------------- tensorflow_addons/layers/crf_test.py | 15 ++++++--- 
tensorflow_addons/losses/crf_loss_test.py | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 65bb459658..9ead48c800 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -199,7 +199,7 @@ def build(self, input_shape): # bias that works with self.kernel if self.use_kernel and self.use_bias: self.bias = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="bias", initializer=self.bias_initializer, regularizer=self.bias_regularizer, @@ -211,14 +211,14 @@ def build(self, input_shape): # weight of to tag probability and tag to probability if self.use_boundary: self.left_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="left_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, constraint=self.boundary_constraint, ) self.right_boundary = self.add_weight( - shape=(self.units, ), + shape=(self.units,), name="right_boundary", initializer=self.boundary_initializer, regularizer=self.boundary_regularizer, @@ -232,8 +232,9 @@ def call(self, inputs, mask=None, **kwargs): # mask: Tensor(shape=(batch_size, sequence_length), dtype=bool) or None if mask is not None: - assert (tf.keras.backend.ndim(mask) == 2 - ), "Input mask to CRF must have dim 2 if not None" + if tf.keras.backend.ndim(mask) != 2: + raise ValueError( + "Input mask to CRF must have dim 2 if not None") # left padding of mask is not supported, due the underline CRF function # detect it and report it to user @@ -246,12 +247,11 @@ def call(self, inputs, mask=None, **kwargs): self.mask = mask if first_mask is not None: + no_left_padding = tf.math.reduce_all(first_mask) + msg = "Currently, CRF layer do not support left padding" with tf.control_dependencies([ tf.debugging.assert_equal( - tf.math.reduce_all(first_mask), - tf.constant(True), - message="Currently, CRF layer do not support left padding" - ) + no_left_padding, tf.constant(True), message=msg) ]): self.potentials = self._dense_layer(inputs) else: @@ -270,12 +270,13 @@ def call(self, inputs, mask=None, **kwargs): return decoded_sequence def _get_sequence_length(self, input_, mask): - """ - Currently underline CRF fucntion (provided by tensorflow_addons.text.crf) - do not support bi-direction masking (left padding / right padding), - it support right padding by tell it the sequence length. + """Currently underline CRF fucntion (provided by + tensorflow_addons.text.crf) do not support bi-direction masking (left + padding / right padding), it support right padding by tell it the + sequence length. - this function is compute the sequence length from input and mask. + this function is compute the sequence length from input and + mask. 
""" if mask is not None: sequence_length = self.mask_to_sequence_length(mask) @@ -290,18 +291,14 @@ def _get_sequence_length(self, input_, mask): return sequence_length def mask_to_sequence_length(self, mask): - """ - compute sequence length from mask - """ + """compute sequence length from mask.""" sequence_length = tf.keras.backend.cast( tf.keras.backend.sum(mask, 1), tf.int64) return sequence_length @staticmethod def _compute_mask_right_boundary(mask): - """ - input mask: 0011100, output left_boundary: 0000100 - """ + """input mask: 0011100, output left_boundary: 0000100.""" # shift mask to left by 1: 0011100 => 0111000 offset = 1 left_shifted_mask = tf.keras.backend.concatenate( @@ -327,9 +324,7 @@ def _compute_mask_right_boundary(mask): @staticmethod def _compute_mask_left_boundary(mask): - """ - input mask: 0011100, output left_boundary: 0010000 - """ + """input mask: 0011100, output left_boundary: 0010000.""" # shift mask to right by 1: 0011100 => 0001110 offset = 1 right_shifted_mask = tf.keras.backend.concatenate( diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index c9b9af761e..12af77b74d 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -54,11 +54,16 @@ def test_unmasked_viterbi_decode(self): test_utils.layer_test( CRF, kwargs={ - "units": 5, - "use_kernel": False, # disable kernel transform - "chain_initializer": tf.keras.initializers.Constant(transitions), - "use_boundary": True, - "boundary_initializer": tf.keras.initializers.Constant(boundary_value), + "units": + 5, + "use_kernel": + False, # disable kernel transform + "chain_initializer": + tf.keras.initializers.Constant(transitions), + "use_boundary": + True, + "boundary_initializer": + tf.keras.initializers.Constant(boundary_value), }, input_data=x, expected_output=expected_y, diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index 4318c086a0..15c5d475fc 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -51,7 +51,7 @@ def setUp(self): [1.0, 1.0, 1.0, 1.0, 1.0], ]) - self.boundary_values = np.ones((5, )) + self.boundary_values = np.ones((5,)) # Use the CRF Module with fixed transitions to compute the log_likelihood self.crf = CRF( From 778160f4b938a2eac8c8d6d8de595effb05cb58f Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 16 Oct 2019 14:50:53 +0800 Subject: [PATCH 29/52] bugfix: Precision and Recall don't work on CRF for now --- tensorflow_addons/losses/crf_loss_test.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index 15c5d475fc..d2b24fa243 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -51,7 +51,7 @@ def setUp(self): [1.0, 1.0, 1.0, 1.0, 1.0], ]) - self.boundary_values = np.ones((5,)) + self.boundary_values = np.ones((5, )) # Use the CRF Module with fixed transitions to compute the log_likelihood self.crf = CRF( @@ -107,11 +107,7 @@ def test_loss_function(self): model.compile( "adam", loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[ - tf.keras.metrics.Accuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall() - ]) + metrics=[tf.keras.metrics.Accuracy()]) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -129,11 +125,7 @@ def test_model_fit(self): model.compile( "adam", 
loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[ - tf.keras.metrics.Accuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall() - ]) + metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -146,11 +138,7 @@ def test_dump_and_load(self): model.compile( "adam", loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, - metrics=[ - tf.keras.metrics.Accuracy(), - tf.keras.metrics.Precision(), - tf.keras.metrics.Recall() - ]) + metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) From 8015ea84b1bb844942b4b579c034ef8bfa7e98e6 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 16 Oct 2019 17:26:19 +0800 Subject: [PATCH 30/52] Add test_in_subclass_model test case which need a patch of tensorflow core --- tensorflow_addons/losses/crf_loss_test.py | 87 +++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_loss_test.py index d2b24fa243..81d835d477 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_loss_test.py @@ -24,10 +24,18 @@ import numpy as np import tensorflow as tf +import six from tensorflow_addons.layers.crf import CRF from tensorflow_addons.losses import crf_loss from tensorflow_addons.utils import test_utils +from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.framework import tensor_util +if six.PY3: + from unittest.mock import patch +else: + from mock import patch +from tensorflow.python.util import nest # TODO(howl-anderson): test CRF as the first layer @@ -227,6 +235,85 @@ def test_mask_right_padding(self): model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) model.fit(train_x, train_y) + def test_in_subclass_model(self): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], + ] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + def patch_mark_as_return(outputs, acd): + """Marks `outputs` as the return values for automatic control deps.""" + + def _mark_as_return(tensor): + """Marks `tensor` as the return value for automatic control deps.""" + if not tensor_util.is_tensor(tensor): + return tensor + + # pylint: disable=protected-access + return_tensor = acd.mark_as_return(tensor) + if getattr(tensor, '_keras_mask', None) is not None: + return_tensor._keras_mask = acd.mark_as_return( + tensor._keras_mask) + else: + return_tensor._keras_mask = None + + # TODO(howl-anderson) a little hack here, handle _keras_history + if getattr(tensor, '_keras_history', None) is not None: + return_tensor._keras_history = tensor._keras_history + + # Handle TensorFlow Probability attached metadata. + # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. 
+ if getattr(tensor, '_tfp_distribution', None) is not None: + return_tensor._tfp_distribution = tensor._tfp_distribution + + return return_tensor + # pylint: enable=protected-access + + return nest.map_structure(_mark_as_return, outputs) + + class CRFModel(tf.keras.Model): + def __init__(self): + super(CRFModel, self).__init__() + + self.layer = CRF(5) + + def call(self, inputs): + return self.layer(inputs) + + @patch.object(base_layer_utils, 'mark_as_return', + patch_mark_as_return) + def __call__(self, inputs, *args, **kwargs): + outputs = super(CRFModel, self).__call__( + inputs, *args, **kwargs) + + # A hack that add _keras_history to EagerTensor, make it more like normal Tensor + for tensor in tf.nest.flatten(outputs): + if not hasattr(tensor, '_keras_history'): + tensor._keras_history = (self, 0, 0) + + return outputs + + model = CRFModel() + + model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + model.fit(train_x, train_y) + if __name__ == "__main__": tf.test.main() From 674267ff9cac7ab3f108d78b830660a2aeed4376 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Thu, 17 Oct 2019 13:54:59 +0800 Subject: [PATCH 31/52] Update comment --- tensorflow_addons/layers/crf.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 9ead48c800..e4997f4e01 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -418,22 +418,11 @@ def compute_output_shape(self, input_shape): return output_shape def compute_mask(self, input_, mask=None): - # """ - # Set output mask to be 1D tensor, so loss method of this class can work without error. - # But there is big short come: - # layer, loss and metrics after this layer - # can not access meaningful mask. Which mean they can not work correctly. - # User only can get correct loss and metrics value from methods of this layer. - # """ - # if mask is not None: - # # transform mask from shape (?, ?) to (?, ) - # new_mask = tf.keras.backend.any(mask, axis=1) - # return new_mask - + """ keep mask shape [batch_size, max_seq_len] """ return mask def get_negative_log_likelihood(self, y_true): - # TODO: remove typing cast + # TODO(howl-anderson): remove unnecessary typing cast self.potentials = tf.keras.backend.cast(self.potentials, tf.float32) y_true = tf.keras.backend.cast(y_true, tf.int32) self.sequence_length = tf.keras.backend.cast(self.sequence_length, From e23e19cb79449cd723ca1f842f79fe838d8faf3c Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 23 Dec 2019 17:41:49 +0800 Subject: [PATCH 32/52] bugfix and docfix --- tensorflow_addons/layers/crf.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index e4997f4e01..0fb4fb58a2 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -22,10 +22,9 @@ import tensorflow as tf from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood -from tensorflow_addons.utils import keras_utils -@keras_utils.register_keras_custom_object +@tf.keras.utils.register_keras_serializable(package='Addons') class CRF(tf.keras.layers.Layer): """Linear chain conditional random field (CRF). 
@@ -33,14 +32,15 @@ class CRF(tf.keras.layers.Layer): ```python from tensorflow_addons.layers import CRF + from tensorflow_addons.losses import crf_loss model = Sequential() model.add(Embedding(3001, 300, mask_zero=True) - crf = CRF(10, name='crf_layer') + crf = CRF(10) model.add(crf) - model.compile('adam', loss={'crf_layer': crf.loss}) + model.compile('adam', loss=crf_loss) model.fit(x, y) ``` @@ -293,7 +293,8 @@ def _get_sequence_length(self, input_, mask): def mask_to_sequence_length(self, mask): """compute sequence length from mask.""" sequence_length = tf.keras.backend.cast( - tf.keras.backend.sum(mask, 1), tf.int64) + tf.keras.backend.sum(tf.keras.backend.cast(mask, tf.int8), 1), + tf.int64) return sequence_length @staticmethod @@ -332,7 +333,10 @@ def _compute_mask_left_boundary(mask): axis=1) # 0011100 > 0001110 => 0010000 - left_boundary = tf.keras.backend.greater(mask, right_shifted_mask) + left_boundary = tf.keras.backend.greater( + tf.dtypes.cast(mask, tf.int32), + tf.dtypes.cast(right_shifted_mask, tf.int32)) + # left_boundary = tf.keras.backend.greater(mask, right_shifted_mask) return left_boundary @@ -418,7 +422,7 @@ def compute_output_shape(self, input_shape): return output_shape def compute_mask(self, input_, mask=None): - """ keep mask shape [batch_size, max_seq_len] """ + """keep mask shape [batch_size, max_seq_len]""" return mask def get_negative_log_likelihood(self, y_true): From b262357ba32c8c6b14bfb14a0914a7e69661736e Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Mon, 23 Dec 2019 17:47:44 +0800 Subject: [PATCH 33/52] update --- tensorflow_addons/losses/BUILD | 8 ++--- tensorflow_addons/losses/__init__.py | 2 +- .../losses/{crf_loss.py => crf.py} | 32 ++++++++++++++--- .../losses/{crf_loss_test.py => crf_test.py} | 36 +++++++++++++------ 4 files changed, 59 insertions(+), 19 deletions(-) rename tensorflow_addons/losses/{crf_loss.py => crf.py} (63%) rename tensorflow_addons/losses/{crf_loss_test.py => crf_test.py} (91%) diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index 731c5ad7bd..04c41e812c 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -7,7 +7,7 @@ py_library( srcs = [ "__init__.py", "contrastive.py", - "crf_loss.py", + "crf.py", "focal_loss.py", "giou_loss.py", "lifted.py", @@ -120,12 +120,12 @@ py_test( ) py_test( - name = "crf_loss_test", + name = "crf_test", size = "small", srcs = [ - "crf_loss_test.py", + "crf_test.py", ], - main = "crf_loss_test.py", + main = "crf_test.py", srcs_version = "PY2AND3", deps = [ ":losses", diff --git a/tensorflow_addons/losses/__init__.py b/tensorflow_addons/losses/__init__.py index 67be687997..78caae6fbc 100644 --- a/tensorflow_addons/losses/__init__.py +++ b/tensorflow_addons/losses/__init__.py @@ -25,4 +25,4 @@ from tensorflow_addons.losses.npairs import npairs_loss, NpairsLoss, npairs_multilabel_loss, NpairsMultilabelLoss from tensorflow_addons.losses.sparsemax_loss import sparsemax_loss, SparsemaxLoss from tensorflow_addons.losses.triplet import triplet_semihard_loss, TripletSemiHardLoss -from tensorflow_addons.losses.crf_loss import ConditionalRandomFieldLoss +from tensorflow_addons.losses.crf import crf_loss, ConditionalRandomFieldLoss diff --git a/tensorflow_addons/losses/crf_loss.py b/tensorflow_addons/losses/crf.py similarity index 63% rename from tensorflow_addons/losses/crf_loss.py rename to tensorflow_addons/losses/crf.py index 9e119922e6..027b8e0d9e 100644 --- a/tensorflow_addons/losses/crf_loss.py +++ b/tensorflow_addons/losses/crf.py @@ 
-21,22 +21,46 @@ import tensorflow as tf from tensorflow_addons.layers.crf import CRF -from tensorflow_addons.utils import keras_utils -@keras_utils.register_keras_custom_object +@tf.keras.utils.register_keras_serializable(package="Addons") class ConditionalRandomFieldLoss(object): + def __init__(self, name="crf_loss"): + self.name = name + def get_config(self): - return {} + return {"name": self.name} def __call__(self, y_true, y_pred, sample_weight=None): crf_layer = y_pred._keras_history[0] # check if last layer is CRF if not isinstance(crf_layer, CRF): - raise ValueError('Last layer must be CRF for use {}.'.format( + raise ValueError("Last layer must be CRF for use {}.".format( self.__class__.__name__)) loss_vector = crf_layer.get_loss(y_true, y_pred) return tf.keras.backend.mean(loss_vector) + + +@tf.keras.utils.register_keras_serializable(package="Addons") +def crf_loss(y_true, y_pred): + """ + Args + y_true: true targets tensor. + y_pred: predictions tensor. + + Returns: + scalar. + """ + crf_layer = y_pred._keras_history[0] + + # check if last layer is CRF + if not isinstance(crf_layer, CRF): + raise ValueError( + "Last layer must be CRF for use {}.".format("crf_loss")) + + loss_vector = crf_layer.get_loss(y_true, y_pred) + + return tf.keras.backend.mean(loss_vector) diff --git a/tensorflow_addons/losses/crf_loss_test.py b/tensorflow_addons/losses/crf_test.py similarity index 91% rename from tensorflow_addons/losses/crf_loss_test.py rename to tensorflow_addons/losses/crf_test.py index 81d835d477..15339b7b49 100644 --- a/tensorflow_addons/losses/crf_loss_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -27,7 +27,7 @@ import six from tensorflow_addons.layers.crf import CRF -from tensorflow_addons.losses import crf_loss +from tensorflow_addons.losses import crf from tensorflow_addons.utils import test_utils from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.framework import tensor_util @@ -59,7 +59,7 @@ def setUp(self): [1.0, 1.0, 1.0, 1.0, 1.0], ]) - self.boundary_values = np.ones((5, )) + self.boundary_values = np.ones((5,)) # Use the CRF Module with fixed transitions to compute the log_likelihood self.crf = CRF( @@ -114,7 +114,7 @@ def test_loss_function(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, + loss=crf.ConditionalRandomFieldLoss(), metrics=[tf.keras.metrics.Accuracy()]) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -132,12 +132,14 @@ def test_model_fit(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, + loss=crf.ConditionalRandomFieldLoss(), metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) def test_dump_and_load(self): + self.skipTest("don't work now") + MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5' model = tf.keras.models.Sequential() @@ -145,7 +147,7 @@ def test_dump_and_load(self): model.add(self.crf) model.compile( "adam", - loss={"crf_layer": crf_loss.ConditionalRandomFieldLoss()}, + loss=crf.ConditionalRandomFieldLoss(), metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -193,7 +195,7 @@ def test_mask_left_padding(self): # check shape inference model = tf.keras.models.Model(x, y) - model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + model.compile('adam', crf.ConditionalRandomFieldLoss()) with self.assertRaises(tf.errors.InvalidArgumentError) as context: 
model.fit(train_x, train_y) @@ -232,7 +234,7 @@ def test_mask_right_padding(self): # check shape inference model = tf.keras.models.Model(x, y) - model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + model.compile('adam', crf.ConditionalRandomFieldLoss()) model.fit(train_x, train_y) def test_in_subclass_model(self): @@ -258,10 +260,12 @@ def test_in_subclass_model(self): ) # yapf: disable def patch_mark_as_return(outputs, acd): - """Marks `outputs` as the return values for automatic control deps.""" + """Marks `outputs` as the return values for automatic control + deps.""" def _mark_as_return(tensor): - """Marks `tensor` as the return value for automatic control deps.""" + """Marks `tensor` as the return value for automatic control + deps.""" if not tensor_util.is_tensor(tensor): return tensor @@ -311,9 +315,21 @@ def __call__(self, inputs, *args, **kwargs): model = CRFModel() - model.compile('adam', crf_loss.ConditionalRandomFieldLoss()) + model.compile('adam', crf.ConditionalRandomFieldLoss()) model.fit(train_x, train_y) + def test_serialization(self, dtype=None): + ref_fn = crf.crf_loss + config = tf.keras.losses.serialize(ref_fn) + fn = tf.keras.losses.deserialize(config) + self.assertEqual(ref_fn, fn) + + def test_keras_model_compile(self): + model = tf.keras.models.Sequential( + [tf.keras.layers.Input(shape=(3, 5)), self.crf]) + + model.compile(loss="Addons>crf_loss", optimizer="adam") + if __name__ == "__main__": tf.test.main() From 19749e81552f798e309c24e07cc0db3e54a16644 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 24 Dec 2019 17:01:15 +0800 Subject: [PATCH 34/52] Update and fix according to reviwer's suggetion --- tensorflow_addons/layers/BUILD | 37 +++++++++------ tensorflow_addons/layers/__init__.py | 2 +- tensorflow_addons/layers/crf.py | 70 ++++++++++++---------------- tensorflow_addons/losses/BUILD | 39 ++++++++++------ tensorflow_addons/losses/crf_test.py | 8 ++-- tensorflow_addons/text/BUILD | 7 +++ 6 files changed, 91 insertions(+), 72 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 89ccbf2476..334628c408 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -25,6 +25,30 @@ py_library( ], ) +py_library( + name = "crf", + srcs = [ + "crf.py", + ], + deps = [ + "//tensorflow_addons/text:crf", + "//tensorflow_addons/utils", + ], +) + +py_test( + name = "crf_test", + size = "small", + srcs = [ + "crf_test.py", + ], + main = "crf_test.py", + srcs_version = "PY2AND3", + deps = [ + "//tensorflow_addons/layers:crf", + ], +) + py_test( name = "gelu_test", size = "small", @@ -96,16 +120,3 @@ py_test( ":layers", ], ) - -py_test( - name = "crf_test", - size = "small", - srcs = [ - "crf_test.py", - ], - main = "crf_test.py", - srcs_version = "PY2AND3", - deps = [ - ":layers", - ], -) diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 4f8c6585eb..432259bcda 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function +from tensorflow_addons.layers.crf import CRF from tensorflow_addons.layers.gelu import GeLU from tensorflow_addons.layers.maxout import Maxout from tensorflow_addons.layers.normalizations import GroupNormalization @@ -26,4 +27,3 @@ from tensorflow_addons.layers.poincare import PoincareNormalize from tensorflow_addons.layers.sparsemax import Sparsemax from tensorflow_addons.layers.wrappers import WeightNormalization 
-from tensorflow_addons.layers.crf import CRF diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 0fb4fb58a2..929118ee6c 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -225,7 +225,6 @@ def build(self, input_shape): constraint=self.boundary_constraint, ) - # or directly call self.built = True super(CRF, self).build(input_shape) def call(self, inputs, mask=None, **kwargs): @@ -250,8 +249,7 @@ def call(self, inputs, mask=None, **kwargs): no_left_padding = tf.math.reduce_all(first_mask) msg = "Currently, CRF layer do not support left padding" with tf.control_dependencies([ - tf.debugging.assert_equal( - no_left_padding, tf.constant(True), message=msg) + tf.debugging.assert_equal(no_left_padding, tf.constant(True), message=msg) ]): self.potentials = self._dense_layer(inputs) else: @@ -292,9 +290,8 @@ def _get_sequence_length(self, input_, mask): def mask_to_sequence_length(self, mask): """compute sequence length from mask.""" - sequence_length = tf.keras.backend.cast( - tf.keras.backend.sum(tf.keras.backend.cast(mask, tf.int8), 1), - tf.int64) + sequence_length = tf.cast( + tf.reduce_sum(tf.cast(mask, tf.int8), 1), tf.int64) return sequence_length @staticmethod @@ -302,10 +299,9 @@ def _compute_mask_right_boundary(mask): """input mask: 0011100, output left_boundary: 0000100.""" # shift mask to left by 1: 0011100 => 0111000 offset = 1 - left_shifted_mask = tf.keras.backend.concatenate( + left_shifted_mask = tf.concat( [mask[:, offset:], - tf.keras.backend.zeros_like(mask[:, :offset])], - axis=1) + tf.zeros_like(mask[:, :offset])], axis=1) # TODO(howl-anderson): for below code # Original code in keras_contrib: @@ -319,7 +315,7 @@ def _compute_mask_right_boundary(mask): # mailed him already and waiting for reply. 
# 0011100 > 0111000 => 0000100 - right_boundary = tf.keras.backend.greater(mask, left_shifted_mask) + right_boundary = tf.greater(mask, left_shifted_mask) return right_boundary @@ -328,40 +324,37 @@ def _compute_mask_left_boundary(mask): """input mask: 0011100, output left_boundary: 0010000.""" # shift mask to right by 1: 0011100 => 0001110 offset = 1 - right_shifted_mask = tf.keras.backend.concatenate( - [tf.keras.backend.zeros_like(mask[:, :offset]), mask[:, :-offset]], - axis=1) + right_shifted_mask = tf.concat( + [tf.zeros_like(mask[:, :offset]), mask[:, :-offset]], axis=1) # 0011100 > 0001110 => 0010000 - left_boundary = tf.keras.backend.greater( - tf.dtypes.cast(mask, tf.int32), - tf.dtypes.cast(right_shifted_mask, tf.int32)) - # left_boundary = tf.keras.backend.greater(mask, right_shifted_mask) + left_boundary = tf.greater( + tf.cast(mask, tf.int32), tf.cast(right_shifted_mask, tf.int32)) + # left_boundary = tf.greater(mask, right_shifted_mask) return left_boundary def add_boundary_energy(self, potentials, mask, start, end): - def expend_scalar_to_3d(x): - # expend tensor from shape (x, ) to (1, 1, x) - return tf.keras.backend.expand_dims( - tf.keras.backend.expand_dims(x, 0), 0) + def expand_scalar_to_3d(x): + # expand tensor from shape (x, ) to (1, 1, x) + return tf.reshape(x, (1, 1, -1)) - start = expend_scalar_to_3d(start) - end = expend_scalar_to_3d(end) + start = expand_scalar_to_3d(start) + end = expand_scalar_to_3d(end) if mask is None: - potentials = tf.keras.backend.concatenate( + potentials = tf.concat( [potentials[:, :1, :] + start, potentials[:, 1:, :]], axis=1) - potentials = tf.keras.backend.concatenate( + potentials = tf.concat( [potentials[:, :-1, :], potentials[:, -1:, :] + end], axis=1) else: mask = tf.keras.backend.expand_dims( - tf.keras.backend.cast(mask, start.dtype), axis=-1) - start_mask = tf.keras.backend.cast( + tf.cast(mask, start.dtype), axis=-1) + start_mask = tf.cast( self._compute_mask_left_boundary(mask), start.dtype, ) - end_mask = tf.keras.backend.cast( + end_mask = tf.cast( self._compute_mask_right_boundary(mask), end.dtype, ) @@ -427,11 +420,10 @@ def compute_mask(self, input_, mask=None): def get_negative_log_likelihood(self, y_true): # TODO(howl-anderson): remove unnecessary typing cast - self.potentials = tf.keras.backend.cast(self.potentials, tf.float32) - y_true = tf.keras.backend.cast(y_true, tf.int32) - self.sequence_length = tf.keras.backend.cast(self.sequence_length, - tf.int32) - # self.chain_kernel = tf.keras.backend.cast(self.chain_kernel, + self.potentials = tf.cast(self.potentials, tf.float32) + y_true = tf.cast(y_true, tf.int32) + self.sequence_length = tf.cast(self.sequence_length, tf.int32) + # self.chain_kernel = tf.cast(self.chain_kernel, # tf.float32) log_likelihood, _ = crf_log_likelihood( @@ -444,14 +436,12 @@ def get_loss(self, y_true, y_pred): return self.get_negative_log_likelihood(y_true) def get_accuracy(self, y_true, y_pred): - judge = tf.keras.backend.cast( - tf.keras.backend.equal(y_pred, y_true), tf.keras.backend.floatx()) + judge = tf.cast(tf.equal(y_pred, y_true), tf.keras.backend.floatx()) if self.mask is None: - return tf.keras.backend.mean(judge) + return tf.reduce_mean(judge) else: - mask = tf.keras.backend.cast(self.mask, tf.keras.backend.floatx()) - return (tf.keras.backend.sum(judge * mask) / - tf.keras.backend.sum(mask)) + mask = tf.cast(self.mask, tf.keras.backend.floatx()) + return (tf.reduce_sum(judge * mask) / tf.reduce_sum(mask)) def _dense_layer(self, input_): if self.use_kernel: @@ -460,7 +450,7 @@ 
def _dense_layer(self, input_): else: output = input_ - return tf.keras.backend.cast(output, self.chain_kernel.dtype) + return tf.cast(output, self.chain_kernel.dtype) def __call__(self, inputs, *args, **kwargs): outputs = super(CRF, self).__call__(inputs, *args, **kwargs) diff --git a/tensorflow_addons/losses/BUILD b/tensorflow_addons/losses/BUILD index 04c41e812c..114ee82117 100644 --- a/tensorflow_addons/losses/BUILD +++ b/tensorflow_addons/losses/BUILD @@ -23,6 +23,17 @@ py_library( ], ) +py_library( + name = "crf", + srcs = [ + "crf.py", + ], + deps = [ + "//tensorflow_addons/layers:crf", + "//tensorflow_addons/utils", + ], +) + py_test( name = "contrastive_test", size = "small", @@ -35,6 +46,20 @@ py_test( ], ) +py_test( + name = "crf_test", + size = "small", + srcs = [ + "crf_test.py", + ], + main = "crf_test.py", + srcs_version = "PY2AND3", + deps = [ + "//tensorflow_addons/layers:crf", + "//tensorflow_addons/losses:crf", + ], +) + py_test( name = "focal_loss_test", size = "small", @@ -118,17 +143,3 @@ py_test( ":losses", ], ) - -py_test( - name = "crf_test", - size = "small", - srcs = [ - "crf_test.py", - ], - main = "crf_test.py", - srcs_version = "PY2AND3", - deps = [ - ":losses", - "//tensorflow_addons/layers", - ], -) diff --git a/tensorflow_addons/losses/crf_test.py b/tensorflow_addons/losses/crf_test.py index 15339b7b49..a2279ec843 100644 --- a/tensorflow_addons/losses/crf_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -138,16 +138,16 @@ def test_model_fit(self): model.fit(self.logits, self.tags, epochs=10, batch_size=1) def test_dump_and_load(self): - self.skipTest("don't work now") - - MODEL_PERSISTENCE_PATH = './test_saving_crf_model.h5' + tmp_dir = self.get_temp_dir() + MODEL_PERSISTENCE_PATH = os.path.join(tmp_dir, + 'test_saving_crf_model.h5') model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) model.compile( "adam", - loss=crf.ConditionalRandomFieldLoss(), + loss="Addons>crf_loss", metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) diff --git a/tensorflow_addons/text/BUILD b/tensorflow_addons/text/BUILD index 37c488831b..8e7e1493d7 100644 --- a/tensorflow_addons/text/BUILD +++ b/tensorflow_addons/text/BUILD @@ -32,6 +32,13 @@ py_library( }), ) +py_library( + name = "crf", + srcs = [ + "crf.py", + ], +) + py_test( name = "crf_test", size = "small", From 989cf2a1a3d819b98ad7a6b8067989ac91749bd6 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 25 Dec 2019 10:59:31 +0800 Subject: [PATCH 35/52] Update for clean unnecessory type cast and use dense layer inside --- tensorflow_addons/layers/crf.py | 53 ++++++++++----------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 929118ee6c..d68b41b6a5 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -163,9 +163,8 @@ def __init__(self, self.mask = None # global variable - self.kernel = None self.chain_kernel = None - self.bias = None + self._dense_layer = None self.left_boundary = None self.right_boundary = None @@ -177,16 +176,6 @@ def build(self, input_shape): feature_size = input_shape[-1] - if self.use_kernel: - # weights that mapping arbitrary tensor to correct shape - self.kernel = self.add_weight( - shape=(feature_size, self.units), - name="kernel", - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint, - ) - # weights that 
work as transfer probabilities between tags
         self.chain_kernel = self.add_weight(
             shape=(self.units, self.units),
@@ -196,18 +185,6 @@ def build(self, input_shape):
             constraint=self.chain_constraint,
         )
 
-        # bias that works with self.kernel
-        if self.use_kernel and self.use_bias:
-            self.bias = self.add_weight(
-                shape=(self.units,),
-                name="bias",
-                initializer=self.bias_initializer,
-                regularizer=self.bias_regularizer,
-                constraint=self.bias_constraint,
-            )
-        else:
-            self.bias = 0
-
         # weights for the start-to-tag and tag-to-end boundary probabilities
         if self.use_boundary:
             self.left_boundary = self.add_weight(
@@ -225,6 +202,21 @@ def build(self, input_shape):
             constraint=self.boundary_constraint,
         )
 
+        if self.use_kernel:
+            self._dense_layer = tf.keras.layers.Dense(
+                units=self.units,
+                activation=self.activation,
+                use_bias=self.use_bias,
+                bias_initializer=self.bias_initializer,
+                kernel_regularizer=self.kernel_regularizer,
+                bias_regularizer=self.bias_regularizer,
+                kernel_constraint=self.kernel_constraint,
+                bias_constraint=self.bias_constraint,
+                dtype=self.dtype
+            )
+        else:
+            self._dense_layer = lambda x: tf.cast(x, dtype=self.dtype)
+
         super(CRF, self).build(input_shape)
 
     def call(self, inputs, mask=None, **kwargs):
@@ -419,12 +411,8 @@ def compute_mask(self, input_, mask=None):
         return mask
 
     def get_negative_log_likelihood(self, y_true):
-        # TODO(howl-anderson): remove unnecessary typing cast
-        self.potentials = tf.cast(self.potentials, tf.float32)
         y_true = tf.cast(y_true, tf.int32)
         self.sequence_length = tf.cast(self.sequence_length, tf.int32)
-        # self.chain_kernel = tf.cast(self.chain_kernel,
-        #                             tf.float32)
 
         log_likelihood, _ = crf_log_likelihood(
            self.potentials, y_true, self.sequence_length, self.chain_kernel)
@@ -443,15 +431,6 @@ def get_accuracy(self, y_true, y_pred):
             mask = tf.cast(self.mask, tf.keras.backend.floatx())
             return (tf.reduce_sum(judge * mask) / tf.reduce_sum(mask))
 
-    def _dense_layer(self, input_):
-        if self.use_kernel:
-            output = self.activation(
-                tf.keras.backend.dot(input_, self.kernel) + self.bias)
-        else:
-            output = input_
-
-        return tf.cast(output, self.chain_kernel.dtype)
-
     def __call__(self, inputs, *args, **kwargs):
         outputs = super(CRF, self).__call__(inputs, *args, **kwargs)

From 484636ff4ceef942d2e7321a51fa84bb7c21d246 Mon Sep 17 00:00:00 2001
From: Xiaoquan Kong
Date: Mon, 3 Feb 2020 11:04:14 +0800
Subject: [PATCH 36/52] Add some docs and helper script

---
 create_debug_docker.bash |  3 ++
 design_docs/crf.md       | 60 ++++++++++++++++++++++++++++++++++++++++
 design_docs/crf_usage.py |  3 ++
 run_debug_docker.bash    |  3 ++
 run_test_in_docker.bash  | 13 +++++++++
 5 files changed, 82 insertions(+)
 create mode 100755 create_debug_docker.bash
 create mode 100644 design_docs/crf.md
 create mode 100644 design_docs/crf_usage.py
 create mode 100755 run_debug_docker.bash
 create mode 100755 run_test_in_docker.bash

diff --git a/create_debug_docker.bash b/create_debug_docker.bash
new file mode 100755
index 0000000000..7f94f66101
--- /dev/null
+++ b/create_debug_docker.bash
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker run --name tf_addons -it -v ${PWD}:/addons -w /addons gcr.io/tensorflow-testing/nosla-ubuntu16.04-manylinux2010 /bin/bash
diff --git a/design_docs/crf.md b/design_docs/crf.md
new file mode 100644
index 0000000000..73b3b507c6
--- /dev/null
+++ b/design_docs/crf.md
@@ -0,0 +1,60 @@
+# Technical choices in the implementation of the CRF layer
+## About the CRF loss function
+Currently the CRF loss function is designed as a separate method/class.
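+Both solutions below share the same core computation: during `call` the CRF layer caches its potentials, sequence lengths and chain kernel, and the loss then recovers the layer from the prediction tensor and computes the negative log-likelihood. A minimal sketch of that shared core (the helper name is ours, for illustration only; it assumes the output tensor carries the layer in `_keras_history`, as the patches above arrange):
+
+```python
+import tensorflow as tf
+
+def _crf_negative_log_likelihood(y_true, y_pred):
+    # Recover the CRF layer that produced y_pred; _keras_history is the
+    # (layer, node_index, tensor_index) triple Keras attaches to outputs.
+    crf_layer = y_pred._keras_history[0]
+    loss_vector = crf_layer.get_loss(y_true, y_pred)  # shape: [batch_size]
+    return tf.keras.backend.mean(loss_vector)
+```
+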
+### Solution 1: standalone loss
+In usage it looks like this:
+
+```python
+from tensorflow.keras.layers import Embedding
+from tensorflow.keras.models import Sequential
+
+from tensorflow_addons.layers import CRF
+from tensorflow_addons.losses import crf_loss
+
+model = Sequential()
+model.add(Embedding(3001, 300, mask_zero=True))
+
+crf = CRF(10)
+model.add(crf)
+
+model.compile('adam', loss=crf_loss)
+
+model.fit(x, y)
+```
+
+#### pros ####
+the standard way to use a loss
+
+#### cons ####
+in eager mode, a complicated patch is needed to make this solution work.
+
+### Solution 2: get the loss from the CRF layer ###
+In usage it looks like this:
+
+```python
+from tensorflow.keras.layers import Embedding
+from tensorflow.keras.models import Sequential
+
+from tensorflow_addons.layers import CRF
+
+model = Sequential()
+model.add(Embedding(3001, 300, mask_zero=True))
+
+crf = CRF(10)
+model.add(crf)
+
+crf_loss = crf.get_keras_loss()
+
+model.compile('adam', loss=crf_loss)
+
+model.fit(x, y)
+```
+
+#### pros ####
+easy to implement, and no patch is needed
+
+#### cons ####
+
+This solution has the shortcoming that the model can no longer be saved to and loaded from disk:
+
+```python
+# Save the model
+model.save('path_to_my_model.h5')
+
+# Recreate the exact same model purely from the file
+new_model = keras.models.load_model('path_to_my_model.h5')
+```
diff --git a/design_docs/crf_usage.py b/design_docs/crf_usage.py
new file mode 100644
index 0000000000..d090113183
--- /dev/null
+++ b/design_docs/crf_usage.py
@@ -0,0 +1,3 @@
+from tensorflow import keras
+
+keras.models.load_model('path_to_my_model.h5')
diff --git a/run_debug_docker.bash b/run_debug_docker.bash
new file mode 100755
index 0000000000..a5007d4512
--- /dev/null
+++ b/run_debug_docker.bash
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+docker container start -ai tf_addons
diff --git a/run_test_in_docker.bash b/run_test_in_docker.bash
new file mode 100755
index 0000000000..b81fe204aa
--- /dev/null
+++ b/run_test_in_docker.bash
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+bazel test -c opt -k \
+    --test_timeout 300,450,1200,3600 \
+    --test_output=all \
+    --run_under=$(readlink -f tools/ci_testing/parallel_gpu_execute.sh) \
+    //tensorflow_addons/layers:crf_test
+
+bazel test -c opt -k \
+    --test_timeout 300,450,1200,3600 \
+    --test_output=all \
+    --run_under=$(readlink -f tools/ci_testing/parallel_gpu_execute.sh) \
+    //tensorflow_addons/losses:crf_test

From e7940bf28adc38c5a1bb3d4466edbf84adb02d62 Mon Sep 17 00:00:00 2001
From: Xiaoquan Kong
Date: Thu, 6 Feb 2020 16:10:04 +0800
Subject: [PATCH 37/52] Update crf design doc

---
 design_docs/crf.md | 53 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/design_docs/crf.md b/design_docs/crf.md
index 73b3b507c6..c3e4433ca0 100644
--- a/design_docs/crf.md
+++ b/design_docs/crf.md
@@ -23,7 +23,22 @@
 the standard way to use a loss
 
 #### cons ####
-in eager mode, a complicated patch is needed to make this solution work.
+in eager mode, a private method of the base layer has to be overridden to make this solution work.
+
+The patch looks like this:
+
+```python
+def __call__(self, inputs, *args, **kwargs):
+    outputs = super(CRF, self).__call__(inputs, *args, **kwargs)
+
+    # A hack that adds _keras_history to EagerTensor, making it behave more like a normal Tensor
+    for tensor in tf.nest.flatten(outputs):
+        if not hasattr(tensor, '_keras_history'):
+            tensor._keras_history = (self, 0, 0)
+
+    return outputs
+```
+
+Maybe this patch should be submitted to TensorFlow core, which would also help others implement loss functions more easily for complicated layers (such as CRF).
 
 ### Solution 2: get the loss from the CRF layer ###
 In usage it looks like this:
@@ -58,3 +73,39 @@ model.save('path_to_my_model.h5')
 # Recreate the exact same model purely from the file
 new_model = keras.models.load_model('path_to_my_model.h5')
 ```
+
+The key code for loading a loss from disk (an h5 file) lives in the function `tensorflow_core.python.keras.saving.saving_utils.compile_args_from_training_config`.
+
+The loss must be a class or function that can be loaded from the default losses, the global custom-object registry, or the `custom_objects` passed by the user.
+
+Since the layer object is constructed inside the `load_model` function, there is no way to pass a loss object generated from a layer instance through `custom_objects`.
+
+I also think such a loss cannot even be saved to disk in the first place. TODO(howl-anderson): add more detailed code later.
+
+## About CRF loss
+
+### Solution 1: inherit from tf.keras.losses.Loss
+
+#### pros
+the recommended way to implement a "normal" loss
+
+#### cons
+
+According to the code around tensorflow_core/python/keras/engine/training.py:1651, the `per_sample_losses` returned by `loss_fn.call(y_true, y_pred)` must have (or be convertible to) the same shape as `sample_weight`, which defaults to the output `mask` of the CRF layer (tensorflow_core/python/keras/engine/training.py:1642).
+
+But that is not possible here: `per_sample_losses` is a 1-D tensor, while the CRF's `mask` is a 2-D tensor (see the sketch at the end of this document).
+
+One way to fix this is to set the CRF layer's output `mask` to a 1-D tensor, but then the mask no longer means what its name suggests.
+
+The other way is to modify the output of the loss class so that `per_sample_losses` becomes a 2-D tensor, and to set the reduction property of the class accordingly. That is so weird, and breaks the semantic meaning of the interface so badly, that it should be considered a bad idea.
+
+### Solution 2: implement the loss as a function
+
+#### pros
+
+Easy to implement, and nothing breaks: the `mask` property is still a meaningful tensor that works as a standard mask.
+
+#### cons
+
+This is the old-style way to implement a loss function, which is not the recommended way in TF 2.x.
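+
+To make the shape argument above concrete, here is a hypothetical sketch of the subclass approach (the class name is illustrative and not part of this patch series):
+
+```python
+import tensorflow as tf
+
+# Keras multiplies the per-sample losses returned by call() with
+# sample_weight, which for the CRF layer defaults to its output mask of
+# shape [batch_size, max_seq_len]; the CRF negative log-likelihood is one
+# scalar per sequence, shape [batch_size], so the two cannot be combined
+# elementwise.
+class CRFLossAsSubclass(tf.keras.losses.Loss):
+    def call(self, y_true, y_pred):
+        crf_layer = y_pred._keras_history[0]
+        return crf_layer.get_loss(y_true, y_pred)  # shape: [batch_size]
+```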
\ No newline at end of file From 68ffff34076dd18394974503f37eaa017d13e495 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 11 Feb 2020 17:25:10 +0800 Subject: [PATCH 38/52] Add CRF layer PoC notebook for PR --- design_docs/PoC_of_crf_layer.ipynb | 288 +++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 design_docs/PoC_of_crf_layer.ipynb diff --git a/design_docs/PoC_of_crf_layer.ipynb b/design_docs/PoC_of_crf_layer.ipynb new file mode 100644 index 0000000000..fb3746f39a --- /dev/null +++ b/design_docs/PoC_of_crf_layer.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W0211 17:23:38.296061 139691842963264 tpu_cluster_resolver.py:35] Falling back to tensorflow client, its recommended to install the cloud tpu client directly with pip install cloud-tpu-client .\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.python.keras.testing_utils import layer_test\n", + "\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Layer define" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### define the layer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@tf.keras.utils.register_keras_serializable(package='dummy-package')\n", + "class DummyLayer(tf.keras.layers.Layer):\n", + " # for each tensor, increase value i for each\n", + " def __init__(self, i=1, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):\n", + " self.i = i\n", + " super().__init__(trainable=trainable, name=name, dtype=dtype, dynamic=dynamic, **kwargs)\n", + "\n", + " def build(self, input_shape):\n", + " self.i_constant = tf.constant(self.i, tf.float32)\n", + "\n", + " super().build(input_shape)\n", + "\n", + " def call(self, inputs, **kwargs):\n", + " output = tf.add(inputs, self.i_constant)\n", + " return output\n", + "\n", + " def get_config(self):\n", + " custom_config = {\n", + " \"i\": self.i\n", + " }\n", + "\n", + " base_config = super().get_config()\n", + "\n", + " config = dict(list(base_config.items()) + list(custom_config.items()))\n", + "\n", + " return config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### test the layer, make sure it works" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0211 17:23:39.144376 139691842963264 training_eager.py:274] The list of trainable weights is empty. 
Make sure that you are not setting model.trainable to False before compiling the model.\n" + ] + }, + { + "data": { + "text/plain": [ + "array([6., 7., 8.], dtype=float32)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "layer_test(\n", + " DummyLayer,\n", + " kwargs={\"i\": 5},\n", + " input_data=np.array([1, 2, 3], np.float32),\n", + " expected_output=np.array([6, 7, 8], np.float32),\n", + " expected_output_dtype=tf.float32,\n", + " validate_training=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save and Load" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "input_data = np.array([1, 2, 3], np.float32)\n", + "expected_output = np.array([6, 7, 8], np.float32)\n", + "\n", + "x = tf.keras.layers.Input(shape=input_data.shape[1:], dtype=input_data.dtype)\n", + "\n", + "layer = DummyLayer(i=5, name=\"dummy\")\n", + "\n", + "model = tf.keras.models.Model(x, layer(x))\n", + "model.compile('rmsprop', 'mse')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"model_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "input_2 (InputLayer) [(None,)] 0 \n", + "_________________________________________________________________\n", + "dummy (DummyLayer) (None,) 0 \n", + "=================================================================\n", + "Total params: 0\n", + "Trainable params: 0\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### save model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(\"model.h5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### load and test it" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "new_model = tf.keras.models.load_model(\"model.h5\")\n", + "assert new_model.predict(np.array([1], np.float32)) == np.array([6], np.float32)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### load and test it with custom_object" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[6.]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "oops, custom layer instance is not used by model", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mexpected\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat32\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpected\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mexpected\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat32\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"oops, custom layer instance is not used by model\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m: oops, custom layer instance is not used by model" + ] + } + ], + "source": [ + "# NOTE: i is set to 2, not the default value 1 or 5 saved in .h5 file\n", + "layer = DummyLayer(i=2, name=\"dummy\")\n", + "\n", + "new_model = tf.keras.models.load_model(\"model.h5\", custom_objects={\"dummy\": layer})\n", + "\n", + "expected = new_model.predict(np.array([1], np.float32))\n", + "print(expected)\n", + "assert expected == np.array([3], np.float32), \"oops, custom layer instance is not used by model\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove model file" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "!rm model.h5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "name": "Untitled.ipynb" + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 1d4e98c0a81765ddaca6499d52c40572508b684d Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 12 Feb 2020 10:45:09 +0800 Subject: [PATCH 39/52] Remove useless files --- tensorflow_addons/layers/old_crf.py | 573 ------------------------- tensorflow_addons/losses/crf_losses.py | 53 --- 2 files changed, 626 deletions(-) delete mode 100644 tensorflow_addons/layers/old_crf.py delete mode 100644 tensorflow_addons/losses/crf_losses.py diff --git a/tensorflow_addons/layers/old_crf.py b/tensorflow_addons/layers/old_crf.py deleted file mode 100644 index 180f97c132..0000000000 --- a/tensorflow_addons/layers/old_crf.py +++ /dev/null @@ -1,573 +0,0 @@ -from __future__ import absolute_import -from __future__ import division - -import warnings - -from keras import backend as K -from keras import activations -from keras import initializers -from keras import regularizers -from keras import constraints -from keras.layers import Layer -from keras.layers import InputSpec - -from keras_contrib.losses import crf_loss -from keras_contrib.metrics import crf_marginal_accuracy -from keras_contrib.metrics import crf_viterbi_accuracy -from keras_contrib.utils.test_utils import to_tuple - - -class CRF(Layer): - """An implementation of linear chain conditional random field (CRF). 
- An linear chain CRF is defined to maximize the following likelihood function: - $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z} - \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n - - \sum_{k=1^n}((f(x_k' W + b) y_k) + y_1' U y_2)), $$ - where: - $Z$: normalization constant - $x_k, y_k$: inputs and outputs - This implementation has two modes for optimization: - 1. (`join mode`) optimized by maximizing join likelihood, - which is optimal in theory of statistics. - Note that in this case, CRF must be the output/last layer. - 2. (`marginal mode`) return marginal probabilities on each time - step and optimized via composition - likelihood (product of marginal likelihood), i.e., - using `categorical_crossentropy` loss. - Note that in this case, CRF can be either the last layer or an - intermediate layer (though not explored). - For prediction (test phrase), one can choose either Viterbi - best path (class indices) or marginal - probabilities if probabilities are needed. - However, if one chooses *join mode* for training, - Viterbi output is typically better than marginal output, - but the marginal output will still perform - reasonably close, while if *marginal mode* is used for training, - marginal output usually performs - much better. The default behavior and `metrics.crf_accuracy` - is set according to this observation. - In addition, this implementation supports masking and accepts either - onehot or sparse target. - If you open a issue or a pull request about CRF, please - add 'cc @lzfelix' to notify Luiz Felix. - # Examples - ```python - from keras_contrib.layers import CRF - from keras_contrib.losses import crf_loss - from keras_contrib.metrics import crf_viterbi_accuracy - model = Sequential() - model.add(Embedding(3001, 300, mask_zero=True)(X) - # use learn_mode = 'join', test_mode = 'viterbi', - # sparse_target = True (label indice output) - crf = CRF(10, sparse_target=True) - model.add(crf) - # crf_accuracy is default to Viterbi acc if using join-mode (default). - # One can add crf.marginal_acc if interested, but may slow down learning - model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy]) - # y must be label indices (with shape 1 at dim 3) here, - # since `sparse_target=True` - model.fit(x, y) - # prediction give onehot representation of Viterbi best path - y_hat = model.predict(x_test) - ``` - The following snippet shows how to load a persisted - model that uses the CRF layer: - ```python - from keras.models import load_model - from keras_contrib.losses import import crf_loss - from keras_contrib.metrics import crf_viterbi_accuracy - custom_objects={'CRF': CRF, - 'crf_loss': crf_loss, - 'crf_viterbi_accuracy': crf_viterbi_accuracy} - loaded_model = load_model('', - custom_objects=custom_objects) - ``` - # Arguments - units: Positive integer, dimensionality of the output space. - learn_mode: Either 'join' or 'marginal'. - The former train the model by maximizing join likelihood while the latter - maximize the product of marginal likelihood over all time steps. - One should use `losses.crf_nll` for 'join' mode - and `losses.categorical_crossentropy` or - `losses.sparse_categorical_crossentropy` for - `marginal` mode. For convenience, simply - use `losses.crf_loss`, which will decide the proper loss as described. - test_mode: Either 'viterbi' or 'marginal'. 
- The former is recommended and as default when `learn_mode = 'join'` and - gives one-hot representation of the best path at test (prediction) time, - while the latter is recommended and chosen as default - when `learn_mode = 'marginal'`, - which produces marginal probabilities for each time step. - For evaluating metrics, one should - use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and - 'metrics.crf_marginal_accuracy' for 'marginal' mode, or - simply use `metrics.crf_accuracy` for - both which automatically decides it as described. - One can also use both for evaluation at training. - sparse_target: Boolean (default False) indicating - if provided labels are one-hot or - indices (with shape 1 at dim 3). - use_boundary: Boolean (default True) indicating if trainable - start-end chain energies - should be added to model. - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). - chain_initializer: Initializer for the `chain_kernel` weights matrix, - used for the CRF chain energy. - (see [initializers](../initializers.md)). - boundary_initializer: Initializer for the `left_boundary`, - 'right_boundary' weights vectors, - used for the start/left and end/right boundary energy. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). - activation: Activation function to use - (see [activations](../activations.md)). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). - chain_regularizer: Regularizer function applied to - the `chain_kernel` weights matrix - (see [regularizer](../regularizers.md)). - boundary_regularizer: Regularizer function applied to - the 'left_boundary', 'right_boundary' weight vectors - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). - chain_constraint: Constraint function applied to - the `chain_kernel` weights matrix - (see [constraints](../constraints.md)). - boundary_constraint: Constraint function applied to - the `left_boundary`, `right_boundary` weights vectors - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). - input_dim: dimensionality of the input (integer). - This argument (or alternatively, the keyword argument `input_shape`) - is required when using this layer as the first layer in a model. - unroll: Boolean (default False). If True, the network will be - unrolled, else a symbolic loop will be used. - Unrolling can speed-up a RNN, although it tends - to be more memory-intensive. - Unrolling is only suitable for short sequences. - # Input shape - 3D tensor with shape `(nb_samples, timesteps, input_dim)`. - # Output shape - 3D tensor with shape `(nb_samples, timesteps, units)`. - # Masking - This layer supports masking for input data with a variable number - of timesteps. To introduce masks to your data, - use an [Embedding](embeddings.md) layer with the `mask_zero` parameter - set to `True`. 
- """ - - def __init__(self, units, - learn_mode='join', - test_mode=None, - sparse_target=False, - use_boundary=True, - use_bias=True, - activation='linear', - kernel_initializer='glorot_uniform', - chain_initializer='orthogonal', - bias_initializer='zeros', - boundary_initializer='zeros', - kernel_regularizer=None, - chain_regularizer=None, - boundary_regularizer=None, - bias_regularizer=None, - kernel_constraint=None, - chain_constraint=None, - boundary_constraint=None, - bias_constraint=None, - input_dim=None, - unroll=False, - **kwargs): - super(CRF, self).__init__(**kwargs) - self.supports_masking = True - self.units = units - self.learn_mode = learn_mode - assert self.learn_mode in ['join', 'marginal'] - self.test_mode = test_mode - if self.test_mode is None: - self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' - else: - assert self.test_mode in ['viterbi', 'marginal'] - self.sparse_target = sparse_target - self.use_boundary = use_boundary - self.use_bias = use_bias - - self.activation = activations.get(activation) - - self.kernel_initializer = initializers.get(kernel_initializer) - self.chain_initializer = initializers.get(chain_initializer) - self.boundary_initializer = initializers.get(boundary_initializer) - self.bias_initializer = initializers.get(bias_initializer) - - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.chain_regularizer = regularizers.get(chain_regularizer) - self.boundary_regularizer = regularizers.get(boundary_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) - - self.kernel_constraint = constraints.get(kernel_constraint) - self.chain_constraint = constraints.get(chain_constraint) - self.boundary_constraint = constraints.get(boundary_constraint) - self.bias_constraint = constraints.get(bias_constraint) - - self.unroll = unroll - - def build(self, input_shape): - input_shape = to_tuple(input_shape) - self.input_spec = [InputSpec(shape=input_shape)] - self.input_dim = input_shape[-1] - - self.kernel = self.add_weight(shape=(self.input_dim, self.units), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - constraint=self.kernel_constraint) - self.chain_kernel = self.add_weight(shape=(self.units, self.units), - name='chain_kernel', - initializer=self.chain_initializer, - regularizer=self.chain_regularizer, - constraint=self.chain_constraint) - if self.use_bias: - self.bias = self.add_weight(shape=(self.units,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = 0 - - if self.use_boundary: - self.left_boundary = self.add_weight(shape=(self.units,), - name='left_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) - self.right_boundary = self.add_weight(shape=(self.units,), - name='right_boundary', - initializer=self.boundary_initializer, - regularizer=self.boundary_regularizer, - constraint=self.boundary_constraint) - self.built = True - - def call(self, X, mask=None): - if mask is not None: - assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' - - if self.test_mode == 'viterbi': - test_output = self.viterbi_decoding(X, mask) - else: - test_output = self.get_marginal_prob(X, mask) - - self.uses_learning_phase = True - if self.learn_mode == 'join': - train_output = K.zeros_like(K.dot(X, self.kernel)) - out = K.in_train_phase(train_output, test_output) - else: - if 
self.test_mode == 'viterbi': - train_output = self.get_marginal_prob(X, mask) - out = K.in_train_phase(train_output, test_output) - else: - out = test_output - return out - - def compute_output_shape(self, input_shape): - return input_shape[:2] + (self.units,) - - def compute_mask(self, input, mask=None): - if mask is not None and self.learn_mode == 'join': - return K.any(mask, axis=1) - return mask - - def get_config(self): - config = { - 'units': self.units, - 'learn_mode': self.learn_mode, - 'test_mode': self.test_mode, - 'use_boundary': self.use_boundary, - 'use_bias': self.use_bias, - 'sparse_target': self.sparse_target, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'chain_initializer': initializers.serialize(self.chain_initializer), - 'boundary_initializer': initializers.serialize( - self.boundary_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'activation': activations.serialize(self.activation), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'chain_regularizer': regularizers.serialize(self.chain_regularizer), - 'boundary_regularizer': regularizers.serialize( - self.boundary_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'chain_constraint': constraints.serialize(self.chain_constraint), - 'boundary_constraint': constraints.serialize(self.boundary_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint), - 'input_dim': self.input_dim, - 'unroll': self.unroll} - base_config = super(CRF, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @property - def loss_function(self): - warnings.warn('CRF.loss_function is deprecated ' - 'and it might be removed in the future. Please ' - 'use losses.crf_loss instead.') - return crf_loss - - @property - def accuracy(self): - warnings.warn('CRF.accuracy is deprecated and it ' - 'might be removed in the future. Please ' - 'use metrics.crf_accuracy') - if self.test_mode == 'viterbi': - return crf_viterbi_accuracy - else: - return crf_marginal_accuracy - - @property - def viterbi_acc(self): - warnings.warn('CRF.viterbi_acc is deprecated and it might ' - 'be removed in the future. Please ' - 'use metrics.viterbi_acc instead.') - return crf_viterbi_accuracy - - @property - def marginal_acc(self): - warnings.warn('CRF.moarginal_acc is deprecated and it ' - 'might be removed in the future. 
Please ' - 'use metrics.marginal_acc instead.') - return crf_marginal_accuracy - - @staticmethod - def softmaxNd(x, axis=-1): - m = K.max(x, axis=axis, keepdims=True) - exp_x = K.exp(x - m) - prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True) - return prob_x - - @staticmethod - def shift_left(x, offset=1): - assert offset > 0 - return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1) - - @staticmethod - def shift_right(x, offset=1): - assert offset > 0 - return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1) - - def add_boundary_energy(self, energy, mask, start, end): - start = K.expand_dims(K.expand_dims(start, 0), 0) - end = K.expand_dims(K.expand_dims(end, 0), 0) - if mask is None: - energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]], - axis=1) - energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end], - axis=1) - else: - mask = K.expand_dims(K.cast(mask, K.floatx())) - start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx()) - end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx()) - energy = energy + start_mask * start - energy = energy + end_mask * end - return energy - - def get_log_normalization_constant(self, input_energy, mask, **kwargs): - """Compute logarithm of the normalization constant Z, where - Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ - """ - # should have logZ[:, i] == logZ[:, j] for any i, j - logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs) - return logZ[:, 0] - - def get_energy(self, y_true, input_energy, mask): - """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3 - """ - input_energy = K.sum(input_energy * y_true, 2) # (B, T) - # (B, T-1) - chain_energy = K.sum(K.dot(y_true[:, :-1, :], - self.chain_kernel) * y_true[:, 1:, :], 2) - - if mask is not None: - mask = K.cast(mask, K.floatx()) - # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding - chain_mask = mask[:, :-1] * mask[:, 1:] - input_energy = input_energy * mask - chain_energy = chain_energy * chain_mask - total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1) # (B, ) - - return total_energy - - def get_negative_log_likelihood(self, y_true, X, mask): - """Compute the loss, i.e., negative log likelihood (normalize by number of time steps) - likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E - """ - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy(input_energy, mask, - self.left_boundary, - self.right_boundary) - energy = self.get_energy(y_true, input_energy, mask) - logZ = self.get_log_normalization_constant(input_energy, mask, - input_length=K.int_shape(X)[1]) - nloglik = logZ + energy - if mask is not None: - nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1) - else: - nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx()) - return nloglik - - def step(self, input_energy_t, states, return_logZ=True): - # not in the following `prev_target_val` has shape = (B, F) - # where B = batch_size, F = output feature dim - # Note: `i` is of float32, due to the behavior of `K.rnn` - prev_target_val, i, chain_energy = states[:3] - t = K.cast(i[0, 0], dtype='int32') - if len(states) > 3: - if K.backend() == 'theano': - m = states[3][:, t:(t + 2)] - else: - m = K.slice(states[3], [0, t], [-1, 2]) - input_energy_t = input_energy_t * K.expand_dims(m[:, 0]) - # (1, F, F)*(B, 1, 1) -> (B, F, F) - chain_energy = chain_energy * 
K.expand_dims( - K.expand_dims(m[:, 0] * m[:, 1])) - if return_logZ: - # shapes: (1, B, F) + (B, F, 1) -> (B, F, F) - energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2) - new_target_val = K.logsumexp(-energy, 1) # shapes: (B, F) - return new_target_val, [new_target_val, i + 1] - else: - energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2) - min_energy = K.min(energy, 1) - # cast for tf-version `K.rnn - argmin_table = K.cast(K.argmin(energy, 1), K.floatx()) - return argmin_table, [min_energy, i + 1] - - def recursion(self, input_energy, mask=None, go_backwards=False, - return_sequences=True, return_logZ=True, input_length=None): - """Forward (alpha) or backward (beta) recursion - If `return_logZ = True`, compute the logZ, the normalization constant: - \[ Z = \sum_{y1, y2, y3} exp(-E) # energy - = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3)) - = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3)) - sum_{y1} exp(-(u1' y1' + y1' W y2))) \] - Denote: - \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \] - \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \] - \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1' + y1' W y2)) \] - Note that: - yi's are one-hot vectors - u1, u3: boundary energies have been merged - If `return_logZ = False`, compute the Viterbi's best path lookup table. - """ - chain_energy = self.chain_kernel - # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t - chain_energy = K.expand_dims(chain_energy, 0) - # shape=(B, F), dtype=float32 - prev_target_val = K.zeros_like(input_energy[:, 0, :]) - - if go_backwards: - input_energy = K.reverse(input_energy, 1) - if mask is not None: - mask = K.reverse(mask, 1) - - initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])] - constants = [chain_energy] - - if mask is not None: - mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1), - K.floatx()) - constants.append(mask2) - - def _step(input_energy_i, states): - return self.step(input_energy_i, states, return_logZ) - - target_val_last, target_val_seq, _ = K.rnn(_step, input_energy, - initial_states, - constants=constants, - input_length=input_length, - unroll=self.unroll) - - if return_sequences: - if go_backwards: - target_val_seq = K.reverse(target_val_seq, 1) - return target_val_seq - else: - return target_val_last - - def forward_recursion(self, input_energy, **kwargs): - return self.recursion(input_energy, **kwargs) - - def backward_recursion(self, input_energy, **kwargs): - return self.recursion(input_energy, go_backwards=True, **kwargs) - - def get_marginal_prob(self, X, mask=None): - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy(input_energy, mask, - self.left_boundary, - self.right_boundary) - input_length = K.int_shape(X)[1] - alpha = self.forward_recursion(input_energy, mask=mask, - input_length=input_length) - beta = self.backward_recursion(input_energy, mask=mask, - input_length=input_length) - if mask is not None: - input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx())) - margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta)) - return self.softmaxNd(margin) - - def viterbi_decoding(self, X, mask=None): - input_energy = self.activation(K.dot(X, self.kernel) + self.bias) - if self.use_boundary: - input_energy = self.add_boundary_energy( - input_energy, mask, self.left_boundary, self.right_boundary) - - argmin_tables = 
self.recursion(input_energy, mask, return_logZ=False) - argmin_tables = K.cast(argmin_tables, 'int32') - - # backward to find best path, `initial_best_idx` can be any, - # as all elements in the last argmin_table are the same - argmin_tables = K.reverse(argmin_tables, 1) - # matrix instead of vector is required by tf `K.rnn` - initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])] - if K.backend() == 'theano': - from theano import tensor as T - initial_best_idx = [T.unbroadcast(initial_best_idx[0], 1)] - - def gather_each_row(params, indices): - n = K.shape(indices)[0] - if K.backend() == 'theano': - from theano import tensor as T - return params[T.arange(n), indices] - elif K.backend() == 'tensorflow': - import tensorflow as tf - indices = K.transpose(K.stack([tf.range(n), indices])) - return tf.gather_nd(params, indices) - else: - raise NotImplementedError - - def find_path(argmin_table, best_idx): - next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0]) - next_best_idx = K.expand_dims(next_best_idx) - if K.backend() == 'theano': - from theano import tensor as T - next_best_idx = T.unbroadcast(next_best_idx, 1) - return next_best_idx, [next_best_idx] - - _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx, - input_length=K.int_shape(X)[1], unroll=self.unroll) - best_paths = K.reverse(best_paths, 1) - best_paths = K.squeeze(best_paths, 2) - - return K.one_hot(best_paths, self.units) \ No newline at end of file diff --git a/tensorflow_addons/losses/crf_losses.py b/tensorflow_addons/losses/crf_losses.py deleted file mode 100644 index 5729f7eb60..0000000000 --- a/tensorflow_addons/losses/crf_losses.py +++ /dev/null @@ -1,53 +0,0 @@ -from keras import backend as K -from keras.losses import categorical_crossentropy -from keras.losses import sparse_categorical_crossentropy - - -def crf_nll(y_true, y_pred): - """The negative log-likelihood for linear chain Conditional Random Field (CRF). - This loss function is only used when the `layers.CRF` layer - is trained in the "join" mode. - # Arguments - y_true: tensor with true targets. - y_pred: tensor with predicted targets. - # Returns - A scalar representing corresponding to the negative log-likelihood. - # Raises - TypeError: If CRF is not the last layer. - # About GitHub - If you open an issue or a pull request about CRF, please - add `cc @lzfelix` to notify Luiz Felix. - """ - - crf, idx = y_pred._keras_history[:2] - if crf._outbound_nodes: - raise TypeError('When learn_model="join", CRF must be the last layer.') - if crf.sparse_target: - y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units) - X = crf._inbound_nodes[idx].input_tensors[0] - mask = crf._inbound_nodes[idx].input_masks[0] - nloglik = crf.get_negative_log_likelihood(y_true, X, mask) - return nloglik - - -def crf_loss(y_true, y_pred): - """General CRF loss function depending on the learning mode. - # Arguments - y_true: tensor with true targets. - y_pred: tensor with predicted targets. - # Returns - If the CRF layer is being trained in the join mode, returns the negative - log-likelihood. Otherwise returns the categorical crossentropy implemented - by the underlying Keras backend. - # About GitHub - If you open an issue or a pull request about CRF, please - add `cc @lzfelix` to notify Luiz Felix. 
- """ - crf, idx = y_pred._keras_history[:2] - if crf.learn_mode == 'join': - return crf_nll(y_true, y_pred) - else: - if crf.sparse_target: - return sparse_categorical_crossentropy(y_true, y_pred) - else: - return categorical_crossentropy(y_true, y_pred) From fc6527ef77d14986274db231fbebfaf7e8f83e7a Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 12 Feb 2020 13:49:45 +0800 Subject: [PATCH 40/52] fix code style --- tensorflow_addons/layers/crf.py | 206 +++++++++++++-------------- tensorflow_addons/layers/crf_test.py | 20 +-- tensorflow_addons/losses/crf.py | 12 +- tensorflow_addons/losses/crf_test.py | 92 ++++++------ 4 files changed, 159 insertions(+), 171 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index d68b41b6a5..305ce424d2 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -16,15 +16,14 @@ # ============================================================================== """Implementing Conditional Random Field layer.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import tensorflow as tf + from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood -@tf.keras.utils.register_keras_serializable(package='Addons') +@tf.keras.utils.register_keras_serializable(package="Addons") class CRF(tf.keras.layers.Layer): """Linear chain conditional random field (CRF). @@ -102,25 +101,27 @@ class CRF(tf.keras.layers.Layer): - [Conditional Random Field](https://en.wikipedia.org/wiki/Conditional_random_field) """ - def __init__(self, - units, - chain_initializer="orthogonal", - chain_regularizer=None, - chain_constraint=None, - use_boundary=True, - boundary_initializer="zeros", - boundary_regularizer=None, - boundary_constraint=None, - use_kernel=True, - kernel_initializer="glorot_uniform", - kernel_regularizer=None, - kernel_constraint=None, - use_bias=True, - bias_initializer="zeros", - bias_regularizer=None, - bias_constraint=None, - activation="linear", - **kwargs): + def __init__( + self, + units, + chain_initializer="orthogonal", + chain_regularizer=None, + chain_constraint=None, + use_boundary=True, + boundary_initializer="zeros", + boundary_regularizer=None, + boundary_constraint=None, + use_kernel=True, + kernel_initializer="glorot_uniform", + kernel_regularizer=None, + kernel_constraint=None, + use_bias=True, + bias_initializer="zeros", + bias_regularizer=None, + bias_constraint=None, + activation="linear", + **kwargs, + ): super(CRF, self).__init__(**kwargs) # setup mask supporting flag, used by base class (the Layer) @@ -138,20 +139,17 @@ def __init__(self, self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) self.chain_initializer = tf.keras.initializers.get(chain_initializer) - self.boundary_initializer = tf.keras.initializers.get( - boundary_initializer) + self.boundary_initializer = tf.keras.initializers.get(boundary_initializer) self.bias_initializer = tf.keras.initializers.get(bias_initializer) self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) self.chain_regularizer = tf.keras.regularizers.get(chain_regularizer) - self.boundary_regularizer = tf.keras.regularizers.get( - boundary_regularizer) + self.boundary_regularizer = tf.keras.regularizers.get(boundary_regularizer) self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) 
self.chain_constraint = tf.keras.constraints.get(chain_constraint) - self.boundary_constraint = tf.keras.constraints.get( - boundary_constraint) + self.boundary_constraint = tf.keras.constraints.get(boundary_constraint) self.bias_constraint = tf.keras.constraints.get(bias_constraint) # values will be assigned in method @@ -174,8 +172,6 @@ def build(self, input_shape): # see API docs of InputSpec for more detail self.input_spec = [tf.keras.layers.InputSpec(shape=input_shape)] - feature_size = input_shape[-1] - # weights that work as transfer probability of each tags self.chain_kernel = self.add_weight( shape=(self.units, self.units), @@ -204,16 +200,16 @@ def build(self, input_shape): if self.use_kernel: self._dense_layer = tf.keras.layers.Dense( - units=self.units, - activation=self.activation, - use_bias=self.use_bias, - bias_initializer=self.bias_initializer, - kernel_regularizer=self.kernel_regularizer, - bias_regularizer=self.bias_regularizer, - kernel_constraint=self.kernel_constraint, - bias_constraint=self.bias_constraint, - dtype=self.dtype - ) + units=self.units, + activation=self.activation, + use_bias=self.use_bias, + bias_initializer=self.bias_initializer, + kernel_regularizer=self.kernel_regularizer, + bias_regularizer=self.bias_regularizer, + kernel_constraint=self.kernel_constraint, + bias_constraint=self.bias_constraint, + dtype=self.dtype, + ) else: self._dense_layer = lambda x: tf.cast(x, dtype=self.dtype) @@ -224,8 +220,7 @@ def call(self, inputs, mask=None, **kwargs): if mask is not None: if tf.keras.backend.ndim(mask) != 2: - raise ValueError( - "Input mask to CRF must have dim 2 if not None") + raise ValueError("Input mask to CRF must have dim 2 if not None") # left padding of mask is not supported, due the underline CRF function # detect it and report it to user @@ -240,9 +235,13 @@ def call(self, inputs, mask=None, **kwargs): if first_mask is not None: no_left_padding = tf.math.reduce_all(first_mask) msg = "Currently, CRF layer do not support left padding" - with tf.control_dependencies([ - tf.debugging.assert_equal(no_left_padding, tf.constant(True), message=msg) - ]): + with tf.control_dependencies( + [ + tf.debugging.assert_equal( + no_left_padding, tf.constant(True), message=msg + ) + ] + ): self.potentials = self._dense_layer(inputs) else: self.potentials = self._dense_layer(inputs) @@ -250,12 +249,14 @@ def call(self, inputs, mask=None, **kwargs): # appending boundary probability info if self.use_boundary: self.potentials = self.add_boundary_energy( - self.potentials, mask, self.left_boundary, self.right_boundary) + self.potentials, mask, self.left_boundary, self.right_boundary + ) self.sequence_length = self._get_sequence_length(inputs, mask) decoded_sequence, _ = self.get_viterbi_decoding( - self.potentials, self.sequence_length) + self.potentials, self.sequence_length + ) return decoded_sequence @@ -282,8 +283,7 @@ def _get_sequence_length(self, input_, mask): def mask_to_sequence_length(self, mask): """compute sequence length from mask.""" - sequence_length = tf.cast( - tf.reduce_sum(tf.cast(mask, tf.int8), 1), tf.int64) + sequence_length = tf.cast(tf.reduce_sum(tf.cast(mask, tf.int8), 1), tf.int64) return sequence_length @staticmethod @@ -292,8 +292,8 @@ def _compute_mask_right_boundary(mask): # shift mask to left by 1: 0011100 => 0111000 offset = 1 left_shifted_mask = tf.concat( - [mask[:, offset:], - tf.zeros_like(mask[:, :offset])], axis=1) + [mask[:, offset:], tf.zeros_like(mask[:, :offset])], axis=1 + ) # TODO(howl-anderson): for below code # 
Original code in keras_contrib: @@ -317,11 +317,13 @@ def _compute_mask_left_boundary(mask): # shift mask to right by 1: 0011100 => 0001110 offset = 1 right_shifted_mask = tf.concat( - [tf.zeros_like(mask[:, :offset]), mask[:, :-offset]], axis=1) + [tf.zeros_like(mask[:, :offset]), mask[:, :-offset]], axis=1 + ) # 0011100 > 0001110 => 0010000 left_boundary = tf.greater( - tf.cast(mask, tf.int32), tf.cast(right_shifted_mask, tf.int32)) + tf.cast(mask, tf.int32), tf.cast(right_shifted_mask, tf.int32) + ) # left_boundary = tf.greater(mask, right_shifted_mask) return left_boundary @@ -335,69 +337,62 @@ def expand_scalar_to_3d(x): end = expand_scalar_to_3d(end) if mask is None: potentials = tf.concat( - [potentials[:, :1, :] + start, potentials[:, 1:, :]], axis=1) + [potentials[:, :1, :] + start, potentials[:, 1:, :]], axis=1 + ) potentials = tf.concat( - [potentials[:, :-1, :], potentials[:, -1:, :] + end], axis=1) - else: - mask = tf.keras.backend.expand_dims( - tf.cast(mask, start.dtype), axis=-1) - start_mask = tf.cast( - self._compute_mask_left_boundary(mask), - start.dtype, + [potentials[:, :-1, :], potentials[:, -1:, :] + end], axis=1 ) + else: + mask = tf.keras.backend.expand_dims(tf.cast(mask, start.dtype), axis=-1) + start_mask = tf.cast(self._compute_mask_left_boundary(mask), start.dtype) - end_mask = tf.cast( - self._compute_mask_right_boundary(mask), - end.dtype, - ) + end_mask = tf.cast(self._compute_mask_right_boundary(mask), end.dtype) potentials = potentials + start_mask * start potentials = potentials + end_mask * end return potentials def get_viterbi_decoding(self, potentials, sequence_length): # decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32` - decode_tags, best_score = crf_decode(potentials, self.chain_kernel, - sequence_length) + decode_tags, best_score = crf_decode( + potentials, self.chain_kernel, sequence_length + ) return decode_tags, best_score def get_config(self): # used for loading model from disk config = { - "units": - self.units, - "use_boundary": - self.use_boundary, - "use_bias": - self.use_bias, - "use_kernel": - self.use_kernel, - "kernel_initializer": - tf.keras.initializers.serialize(self.kernel_initializer), - "chain_initializer": - tf.keras.initializers.serialize(self.chain_initializer), - "boundary_initializer": - tf.keras.initializers.serialize(self.boundary_initializer), - "bias_initializer": - tf.keras.initializers.serialize(self.bias_initializer), - "activation": - tf.keras.activations.serialize(self.activation), - "kernel_regularizer": - tf.keras.regularizers.serialize(self.kernel_regularizer), - "chain_regularizer": - tf.keras.regularizers.serialize(self.chain_regularizer), - "boundary_regularizer": - tf.keras.regularizers.serialize(self.boundary_regularizer), - "bias_regularizer": - tf.keras.regularizers.serialize(self.bias_regularizer), - "kernel_constraint": - tf.keras.constraints.serialize(self.kernel_constraint), - "chain_constraint": - tf.keras.constraints.serialize(self.chain_constraint), - "boundary_constraint": - tf.keras.constraints.serialize(self.boundary_constraint), - "bias_constraint": - tf.keras.constraints.serialize(self.bias_constraint) + "units": self.units, + "use_boundary": self.use_boundary, + "use_bias": self.use_bias, + "use_kernel": self.use_kernel, + "kernel_initializer": tf.keras.initializers.serialize( + self.kernel_initializer + ), + "chain_initializer": tf.keras.initializers.serialize( + self.chain_initializer + ), + "boundary_initializer": tf.keras.initializers.serialize( + 
self.boundary_initializer + ), + "bias_initializer": tf.keras.initializers.serialize(self.bias_initializer), + "activation": tf.keras.activations.serialize(self.activation), + "kernel_regularizer": tf.keras.regularizers.serialize( + self.kernel_regularizer + ), + "chain_regularizer": tf.keras.regularizers.serialize( + self.chain_regularizer + ), + "boundary_regularizer": tf.keras.regularizers.serialize( + self.boundary_regularizer + ), + "bias_regularizer": tf.keras.regularizers.serialize(self.bias_regularizer), + "kernel_constraint": tf.keras.constraints.serialize(self.kernel_constraint), + "chain_constraint": tf.keras.constraints.serialize(self.chain_constraint), + "boundary_constraint": tf.keras.constraints.serialize( + self.boundary_constraint + ), + "bias_constraint": tf.keras.constraints.serialize(self.bias_constraint), } base_config = super(CRF, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -415,7 +410,8 @@ def get_negative_log_likelihood(self, y_true): self.sequence_length = tf.cast(self.sequence_length, tf.int32) log_likelihood, _ = crf_log_likelihood( - self.potentials, y_true, self.sequence_length, self.chain_kernel) + self.potentials, y_true, self.sequence_length, self.chain_kernel + ) return -log_likelihood @@ -429,14 +425,14 @@ def get_accuracy(self, y_true, y_pred): return tf.reduce_mean(judge) else: mask = tf.cast(self.mask, tf.keras.backend.floatx()) - return (tf.reduce_sum(judge * mask) / tf.reduce_sum(mask)) + return tf.reduce_sum(judge * mask) / tf.reduce_sum(mask) def __call__(self, inputs, *args, **kwargs): outputs = super(CRF, self).__call__(inputs, *args, **kwargs) # A hack that add _keras_history to EagerTensor, make it more like normal Tensor for tensor in tf.nest.flatten(outputs): - if not hasattr(tensor, '_keras_history'): + if not hasattr(tensor, "_keras_history"): tensor._keras_history = (self, 0, 0) return outputs diff --git a/tensorflow_addons/layers/crf_test.py b/tensorflow_addons/layers/crf_test.py index 12af77b74d..9910b391be 100644 --- a/tensorflow_addons/layers/crf_test.py +++ b/tensorflow_addons/layers/crf_test.py @@ -14,12 +14,11 @@ # ============================================================================== """Tests for Conditional Random Field layer.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import numpy as np import tensorflow as tf + from tensorflow_addons.layers.crf import CRF from tensorflow_addons.utils import test_utils @@ -54,16 +53,11 @@ def test_unmasked_viterbi_decode(self): test_utils.layer_test( CRF, kwargs={ - "units": - 5, - "use_kernel": - False, # disable kernel transform - "chain_initializer": - tf.keras.initializers.Constant(transitions), - "use_boundary": - True, - "boundary_initializer": - tf.keras.initializers.Constant(boundary_value), + "units": 5, + "use_kernel": False, # disable kernel transform + "chain_initializer": tf.keras.initializers.Constant(transitions), + "use_boundary": True, + "boundary_initializer": tf.keras.initializers.Constant(boundary_value), }, input_data=x, expected_output=expected_y, diff --git a/tensorflow_addons/losses/crf.py b/tensorflow_addons/losses/crf.py index 027b8e0d9e..270b9329d7 100644 --- a/tensorflow_addons/losses/crf.py +++ b/tensorflow_addons/losses/crf.py @@ -14,9 +14,7 @@ # ============================================================================== """Implementing Conditional Random Field loss.""" -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import tensorflow as tf @@ -36,8 +34,9 @@ def __call__(self, y_true, y_pred, sample_weight=None): # check if last layer is CRF if not isinstance(crf_layer, CRF): - raise ValueError("Last layer must be CRF for use {}.".format( - self.__class__.__name__)) + raise ValueError( + "Last layer must be CRF for use {}.".format(self.__class__.__name__) + ) loss_vector = crf_layer.get_loss(y_true, y_pred) @@ -58,8 +57,7 @@ def crf_loss(y_true, y_pred): # check if last layer is CRF if not isinstance(crf_layer, CRF): - raise ValueError( - "Last layer must be CRF for use {}.".format("crf_loss")) + raise ValueError("Last layer must be CRF for use {}.".format("crf_loss")) loss_vector = crf_layer.get_loss(y_true, y_pred) diff --git a/tensorflow_addons/losses/crf_test.py b/tensorflow_addons/losses/crf_test.py index a2279ec843..bd5e554bea 100644 --- a/tensorflow_addons/losses/crf_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -1,4 +1,4 @@ -## Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,28 +14,27 @@ # ============================================================================== """Tests for Conditional Random Field loss.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function import itertools import math import os import numpy as np -import tensorflow as tf import six +import tensorflow as tf +from tensorflow.python.framework import tensor_util +from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.util import nest from tensorflow_addons.layers.crf import CRF from tensorflow_addons.losses import crf from tensorflow_addons.utils import test_utils -from tensorflow.python.keras.engine import base_layer_utils -from tensorflow.python.framework import tensor_util + if six.PY3: from unittest.mock import patch else: from mock import patch -from tensorflow.python.util import nest # TODO(howl-anderson): test CRF as the first layer @@ -45,19 +44,23 @@ class ConditionalRandomFieldLossTest(tf.test.TestCase): def setUp(self): super(ConditionalRandomFieldLossTest, self).setUp() - self.logits = np.array([ - [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], - [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], - ]) + self.logits = np.array( + [ + [[0, 0, 0.5, 0.5, 0.2], [0, 0, 0.3, 0.3, 0.1], [0, 0, 0.9, 10, 1]], + [[0, 0, 0.2, 0.5, 0.2], [0, 0, 3, 0.3, 0.1], [0, 0, 0.9, 1, 1]], + ] + ) self.tags = np.array([[2, 3, 4], [3, 2, 2]]) - self.transitions = np.array([ - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.8, 0.3, 0.1, 0.7, 0.9], - [-0.3, 2.1, -5.6, 3.4, 4.0], - [0.2, 0.4, 0.6, -0.3, -0.4], - [1.0, 1.0, 1.0, 1.0, 1.0], - ]) + self.transitions = np.array( + [ + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.8, 0.3, 0.1, 0.7, 0.9], + [-0.3, 2.1, -5.6, 3.4, 4.0], + [0.2, 0.4, 0.6, -0.3, -0.4], + [1.0, 1.0, 1.0, 1.0, 1.0], + ] + ) self.boundary_values = np.ones((5,)) @@ -67,8 +70,7 @@ def setUp(self): use_kernel=False, # disable kernel transform chain_initializer=tf.keras.initializers.Constant(self.transitions), use_boundary=True, - 
boundary_initializer=tf.keras.initializers.Constant( - self.boundary_values), + boundary_initializer=tf.keras.initializers.Constant(self.boundary_values), name="crf_layer", ) @@ -100,8 +102,7 @@ def compute_log_likelihood(self): self.score(logits_i, tags_j) for tags_j in itertools.product(range(5), repeat=3) ] - denominator = math.log( - sum(math.exp(score) for score in all_scores)) + denominator = math.log(sum(math.exp(score) for score in all_scores)) # And include them in the manual calculation. manual_log_likelihood += numerator - denominator @@ -115,7 +116,8 @@ def test_loss_function(self): model.compile( "adam", loss=crf.ConditionalRandomFieldLoss(), - metrics=[tf.keras.metrics.Accuracy()]) + metrics=[tf.keras.metrics.Accuracy()], + ) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -133,22 +135,21 @@ def test_model_fit(self): model.compile( "adam", loss=crf.ConditionalRandomFieldLoss(), - metrics=[tf.keras.metrics.Accuracy()]) + metrics=[tf.keras.metrics.Accuracy()], + ) model.fit(self.logits, self.tags, epochs=10, batch_size=1) def test_dump_and_load(self): tmp_dir = self.get_temp_dir() - MODEL_PERSISTENCE_PATH = os.path.join(tmp_dir, - 'test_saving_crf_model.h5') + MODEL_PERSISTENCE_PATH = os.path.join(tmp_dir, "test_saving_crf_model.h5") model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) model.compile( - "adam", - loss="Addons>crf_loss", - metrics=[tf.keras.metrics.Accuracy()]) + "adam", loss="Addons>crf_loss", metrics=[tf.keras.metrics.Accuracy()] + ) model.fit(self.logits, self.tags, epochs=10, batch_size=1) @@ -195,13 +196,14 @@ def test_mask_left_padding(self): # check shape inference model = tf.keras.models.Model(x, y) - model.compile('adam', crf.ConditionalRandomFieldLoss()) + model.compile("adam", crf.ConditionalRandomFieldLoss()) with self.assertRaises(tf.errors.InvalidArgumentError) as context: model.fit(train_x, train_y) - self.assertTrue("CRF layer do not support left padding" in - context.exception.message) + self.assertTrue( + "CRF layer do not support left padding" in context.exception.message + ) def test_mask_right_padding(self): train_x = np.array( @@ -234,7 +236,7 @@ def test_mask_right_padding(self): # check shape inference model = tf.keras.models.Model(x, y) - model.compile('adam', crf.ConditionalRandomFieldLoss()) + model.compile("adam", crf.ConditionalRandomFieldLoss()) model.fit(train_x, train_y) def test_in_subclass_model(self): @@ -271,19 +273,18 @@ def _mark_as_return(tensor): # pylint: disable=protected-access return_tensor = acd.mark_as_return(tensor) - if getattr(tensor, '_keras_mask', None) is not None: - return_tensor._keras_mask = acd.mark_as_return( - tensor._keras_mask) + if getattr(tensor, "_keras_mask", None) is not None: + return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask) else: return_tensor._keras_mask = None # TODO(howl-anderson) a little hack here, handle _keras_history - if getattr(tensor, '_keras_history', None) is not None: + if getattr(tensor, "_keras_history", None) is not None: return_tensor._keras_history = tensor._keras_history # Handle TensorFlow Probability attached metadata. # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. 
- if getattr(tensor, '_tfp_distribution', None) is not None: + if getattr(tensor, "_tfp_distribution", None) is not None: return_tensor._tfp_distribution = tensor._tfp_distribution return return_tensor @@ -300,22 +301,20 @@ def __init__(self): def call(self, inputs): return self.layer(inputs) - @patch.object(base_layer_utils, 'mark_as_return', - patch_mark_as_return) + @patch.object(base_layer_utils, "mark_as_return", patch_mark_as_return) def __call__(self, inputs, *args, **kwargs): - outputs = super(CRFModel, self).__call__( - inputs, *args, **kwargs) + outputs = super(CRFModel, self).__call__(inputs, *args, **kwargs) # A hack that add _keras_history to EagerTensor, make it more like normal Tensor for tensor in tf.nest.flatten(outputs): - if not hasattr(tensor, '_keras_history'): + if not hasattr(tensor, "_keras_history"): tensor._keras_history = (self, 0, 0) return outputs model = CRFModel() - model.compile('adam', crf.ConditionalRandomFieldLoss()) + model.compile("adam", crf.ConditionalRandomFieldLoss()) model.fit(train_x, train_y) def test_serialization(self, dtype=None): @@ -326,7 +325,8 @@ def test_serialization(self, dtype=None): def test_keras_model_compile(self): model = tf.keras.models.Sequential( - [tf.keras.layers.Input(shape=(3, 5)), self.crf]) + [tf.keras.layers.Input(shape=(3, 5)), self.crf] + ) model.compile(loss="Addons>crf_loss", optimizer="adam") From 78c390f7a16435ae704a02cdfd2fde0589fa34f4 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 12 Feb 2020 15:11:48 +0800 Subject: [PATCH 41/52] Remove useless file --- design_docs/crf_usage.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 design_docs/crf_usage.py diff --git a/design_docs/crf_usage.py b/design_docs/crf_usage.py deleted file mode 100644 index d090113183..0000000000 --- a/design_docs/crf_usage.py +++ /dev/null @@ -1,3 +0,0 @@ -from tensorflow import keras - -keras.models.load_model('path_to_my_model.h5') From 2f255707aa6de1d9d35febab1e40f615fb28e4f3 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 12 Feb 2020 16:33:54 +0800 Subject: [PATCH 42/52] CI: rerun From ab2ad100ef2be8122b472cd2682ba68e22f782f4 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Thu, 13 Feb 2020 10:33:03 +0800 Subject: [PATCH 43/52] Add typing hint --- tensorflow_addons/layers/crf.py | 37 +++++++++++++++++--------------- tensorflow_addons/losses/crf.py | 5 +++-- tensorflow_addons/utils/types.py | 2 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 305ce424d2..3a4bd5b3df 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -19,8 +19,10 @@ from __future__ import absolute_import, division, print_function import tensorflow as tf +from typeguard import typechecked from tensorflow_addons.text.crf import crf_decode, crf_log_likelihood +from tensorflow_addons.utils import types @tf.keras.utils.register_keras_serializable(package="Addons") @@ -101,25 +103,26 @@ class CRF(tf.keras.layers.Layer): - [Conditional Random Field](https://en.wikipedia.org/wiki/Conditional_random_field) """ + @typechecked def __init__( self, - units, - chain_initializer="orthogonal", - chain_regularizer=None, - chain_constraint=None, - use_boundary=True, - boundary_initializer="zeros", - boundary_regularizer=None, - boundary_constraint=None, - use_kernel=True, - kernel_initializer="glorot_uniform", - kernel_regularizer=None, - kernel_constraint=None, - use_bias=True, - bias_initializer="zeros", - bias_regularizer=None, - 
bias_constraint=None, - activation="linear", + units: int, + chain_initializer: types.Initializer = "orthogonal", + chain_regularizer: types.Regularizer = None, + chain_constraint: types.Constraint = None, + use_boundary: bool = True, + boundary_initializer: types.Initializer = "zeros", + boundary_regularizer: types.Regularizer = None, + boundary_constraint: types.Constraint = None, + use_kernel: bool = True, + kernel_initializer: types.Initializer = "glorot_uniform", + kernel_regularizer: types.Regularizer = None, + kernel_constraint: types.Constraint = None, + use_bias: bool = True, + bias_initializer: types.Initializer = "zeros", + bias_regularizer: types.Regularizer = None, + bias_constraint: types.Constraint = None, + activation: types.Activation = "linear", **kwargs, ): super(CRF, self).__init__(**kwargs) diff --git a/tensorflow_addons/losses/crf.py b/tensorflow_addons/losses/crf.py index 270b9329d7..e748dffd68 100644 --- a/tensorflow_addons/losses/crf.py +++ b/tensorflow_addons/losses/crf.py @@ -19,11 +19,12 @@ import tensorflow as tf from tensorflow_addons.layers.crf import CRF +from tensorflow_addons.utils import types @tf.keras.utils.register_keras_serializable(package="Addons") class ConditionalRandomFieldLoss(object): - def __init__(self, name="crf_loss"): + def __init__(self, name: str = "crf_loss"): self.name = name def get_config(self): @@ -44,7 +45,7 @@ def __call__(self, y_true, y_pred, sample_weight=None): @tf.keras.utils.register_keras_serializable(package="Addons") -def crf_loss(y_true, y_pred): +def crf_loss(y_true: types.TensorLike, y_pred: types.TensorLike) -> tf.Tensor: """ Args y_true: true targets tensor. diff --git a/tensorflow_addons/utils/types.py b/tensorflow_addons/utils/types.py index aca916bb59..ea6c85286e 100644 --- a/tensorflow_addons/utils/types.py +++ b/tensorflow_addons/utils/types.py @@ -41,6 +41,6 @@ Constraint = Union[None, dict, str, Callable] Activation = Union[None, str, Callable] -TensorLike = Union[List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor] +TensorLike = Union[List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor, tf.Variable] FloatTensorLike = Union[tf.Tensor, float, np.float16, np.float32, np.float64] AcceptableDTypes = Union[tf.DType, np.dtype, type, int, str, None] From e3d3cfe877915b250f14f4032c409f7ece3040ff Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Thu, 13 Feb 2020 11:15:40 +0800 Subject: [PATCH 44/52] Fix code style --- tensorflow_addons/layers/crf.py | 2 +- tensorflow_addons/utils/types.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py index 3a4bd5b3df..047e2f9eeb 100644 --- a/tensorflow_addons/layers/crf.py +++ b/tensorflow_addons/layers/crf.py @@ -123,7 +123,7 @@ def __init__( bias_regularizer: types.Regularizer = None, bias_constraint: types.Constraint = None, activation: types.Activation = "linear", - **kwargs, + **kwargs ): super(CRF, self).__init__(**kwargs) diff --git a/tensorflow_addons/utils/types.py b/tensorflow_addons/utils/types.py index ea6c85286e..f6fda9fc7d 100644 --- a/tensorflow_addons/utils/types.py +++ b/tensorflow_addons/utils/types.py @@ -41,6 +41,8 @@ Constraint = Union[None, dict, str, Callable] Activation = Union[None, str, Callable] -TensorLike = Union[List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor, tf.Variable] +TensorLike = Union[ + List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor, tf.Variable +] FloatTensorLike = Union[tf.Tensor, float, np.float16, 
np.float32, np.float64]
 AcceptableDTypes = Union[tf.DType, np.dtype, type, int, str, None]

From 42f6b335c091197c796550d573edc89cb43dad90 Mon Sep 17 00:00:00 2001
From: Xiaoquan Kong
Date: Fri, 14 Feb 2020 11:13:43 +0800
Subject: [PATCH 45/52] fix TODOs

---
 tensorflow_addons/layers/crf.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow_addons/layers/crf.py b/tensorflow_addons/layers/crf.py
index 047e2f9eeb..29acd7a8cf 100644
--- a/tensorflow_addons/layers/crf.py
+++ b/tensorflow_addons/layers/crf.py
@@ -298,16 +298,15 @@ def _compute_mask_right_boundary(mask):
            [mask[:, offset:], tf.zeros_like(mask[:, :offset])], axis=1
        )

-        # TODO(howl-anderson): for below code
+        # NOTE: below code is different from keras_contrib
        # Original code in keras_contrib:
        # end_mask = K.cast(
        #   K.greater(self.shift_left(mask), mask),
        #   K.floatx()
        # )
-        # May have a bug, it's better confirmed
+        # has a bug, confirmed
        # by the original keras_contrib maintainer
        # Luiz Felix (github: lzfelix),
-        # mailed him already and waiting for reply.

        # 0011100 > 0111000 => 0000100
        right_boundary = tf.greater(mask, left_shifted_mask)

From d21707c5e28ef662473c33934fe1f0d90a82f15f Mon Sep 17 00:00:00 2001
From: Xiaoquan Kong
Date: Tue, 25 Feb 2020 14:21:40 +0800
Subject: [PATCH 46/52] Update doc

---
 design_docs/crf.md | 66 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 12 deletions(-)

diff --git a/design_docs/crf.md b/design_docs/crf.md
index c3e4433ca0..ee3a1af2bd 100644
--- a/design_docs/crf.md
+++ b/design_docs/crf.md
@@ -64,7 +64,9 @@ easy to implement and no more need patch

 #### cons ####

-This solution has a shortage that this model can not be save and load from disk anymore.
+This solution has a drawback: loading the model back from disk becomes difficult.
+
+##### TensorFlow's default load process doesn't work #####

 ```python
 # Save the model
@@ -74,13 +76,35 @@ model.save('path_to_my_model.h5')
 new_model = keras.models.load_model('path_to_my_model.h5')
 ```

-key code snippet of how load loss from disk (as h5 file) in the function `tensorflow_core.python.keras.saving.saving_utils.compile_args_from_training_config`.
+The reason is that when Keras reconstructs the model from disk, it constructs the layer and the loss independently, so the new loss instance holds no reference to the new CRF layer instance, and therefore the loss function no longer works.
+
+##### A workaround solution (not perfect) #####
+TODO: add PoC code for this (a consolidated sketch follows this patch)

-loss function must be a class or function that can load from default losses, global custom losses registry or custom_objects passed by user.
+This is a workaround for loading a CRF model from disk.

-Since the layer object was constructed in side the `load_model` function, there is no way to pass a loss object generated from a layer object though custom_objects.
+1. Load the model without compiling it
+```python
+new_model = keras.models.load_model('path_to_my_model.h5', compile=False)
+```
+
+2. Get the CRF layer instance
+```python
+# normally, the CRF layer is the last layer
+crf_layer_instance = new_model.get_layer(index=-1)
+```
+
+3. Get the CRF loss instance from the layer instance
+```python
+crf_loss_instance = crf_layer_instance.get_keras_loss()
+```
+
+4. Compile the model
+```python
+new_model.compile(loss=crf_loss_instance)
+```
-
-Also I think it even can not be saved to disk. TODO(howl-anderson): add more detailed code later.
+The drawback of this method is that the user needs extra code to load the model, and all the arguments other than the loss that were previously passed to the model's compile method are no longer remembered; the user has to pass them again (if they still remember them).

 ## About CRF loss

@@ -90,8 +114,7 @@
 the recommended way to implement a "normal" loss

 #### cons
-
-according to the code around tensorflow_core/python/keras/engine/training.py:1651
+According to the code around `tensorflow_core/python/keras/engine/training.py:1651`:

 `per_sample_losses` returned by `loss_fn.call(y_true, y_pred)` must have (or be convertible to) the same shape as `sample_weight`, which defaults to the output `mask` (tensorflow_core/python/keras/engine/training.py:1642) of the CRF layer.

 But that is not possible here, because `per_sample_losses` is a 1d tensor while the `mask` of the CRF layer is a 2d tensor.

@@ -100,12 +123,31 @@ One way to fix it is set output `mark` of crf layer to a 1d tensor, which make t
 The other way is to modify the output of the loss class so that `per_sample_losses` becomes a 2d tensor, and to set the reduction property of the class accordingly. That is weird, breaks the semantic meaning of the interface, and should be considered a bad idea.

-### Solution 2: implement loss as a function
-#### pros
+### Solution 2: implement as a function ###

-easy to implement and nothing breaks. `mark` property is still a meaningful tensor which standard as a mark.
+#### pros ####
+This is an old but standard (Keras-style) way to implement a loss function.

-#### cons
+#### cons ####
+TensorFlow will convert a loss function into a subclass of `tf.keras.losses.Loss` (call chain: `tf.keras.Model::compile()` [Line: 314] > `tensorflow/python/keras/engine/training_utils.py::prepare_loss_functions` [Line: 1501] > `tensorflow/python/keras/engine/training_utils.py::get_loss_function` [Line: 1186]).

-this is a old style way to implement a loss function, which is not the recommend way in TF 2.x.
\ No newline at end of file
+```python
+  # For losses which are given as strings/functions in the compile API,
+  # we always set the loss reduction type to be `SUM_OVER_BATCH_SIZE`
+  # (both in distribution strategy context and otherwise).
+  return losses.LossFunctionWrapper(
+      loss_fn,
+      name=loss_fn.__name__,
+      reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
+```
+
+So it has the same issue as Solution 1.
+
+### Solution 3: implement loss as a callable class
+
+#### pros
+Nothing breaks: the `mask` property is still a meaningful tensor that serves as a standard mask.
+
+#### cons
+This solution requires understanding how Keras processes a loss function, which is not documented and is not the recommended way in TF 2.x.
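
The consolidated sketch promised by the TODO in the design doc above: a minimal PoC of the four-step load workaround, not a definitive implementation. It assumes the saved model ends with the CRF layer and that the layer exposes the `get_keras_loss()` method the design doc refers to; neither holds for arbitrary models.

```python
import tensorflow as tf

# Step 1: load without compiling. Keras cannot restore the saved loss itself,
# because the loss needs a reference to the live CRF layer instance.
new_model = tf.keras.models.load_model("path_to_my_model.h5", compile=False)

# Step 2: fetch the CRF layer (assumption: it is the last layer).
crf_layer_instance = new_model.get_layer(index=-1)

# Step 3: rebuild the loss from the layer it must be bound to
# (assumption: `get_keras_loss()` exists, as described in the doc).
crf_loss_instance = crf_layer_instance.get_keras_loss()

# Step 4: compile again by hand. Every other compile argument (optimizer,
# metrics, ...) has to be passed anew, since none of them were restored.
new_model.compile(optimizer="adam", loss=crf_loss_instance)
```
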
From 40043102ce0ffccd10cc3c4731ca55f533cf1362 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 25 Feb 2020 14:18:06 +0800 Subject: [PATCH 47/52] Update loss and loss tests --- tensorflow_addons/losses/crf.py | 20 +- tensorflow_addons/losses/crf_test.py | 376 ++++++++++++++------------- 2 files changed, 199 insertions(+), 197 deletions(-) diff --git a/tensorflow_addons/losses/crf.py b/tensorflow_addons/losses/crf.py index e748dffd68..1f96b4606b 100644 --- a/tensorflow_addons/losses/crf.py +++ b/tensorflow_addons/losses/crf.py @@ -44,22 +44,4 @@ def __call__(self, y_true, y_pred, sample_weight=None): return tf.keras.backend.mean(loss_vector) -@tf.keras.utils.register_keras_serializable(package="Addons") -def crf_loss(y_true: types.TensorLike, y_pred: types.TensorLike) -> tf.Tensor: - """ - Args - y_true: true targets tensor. - y_pred: predictions tensor. - - Returns: - scalar. - """ - crf_layer = y_pred._keras_history[0] - - # check if last layer is CRF - if not isinstance(crf_layer, CRF): - raise ValueError("Last layer must be CRF for use {}.".format("crf_loss")) - - loss_vector = crf_layer.get_loss(y_true, y_pred) - - return tf.keras.backend.mean(loss_vector) +crf_loss = ConditionalRandomFieldLoss() diff --git a/tensorflow_addons/losses/crf_test.py b/tensorflow_addons/losses/crf_test.py index bd5e554bea..47e58b698d 100644 --- a/tensorflow_addons/losses/crf_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -36,7 +36,7 @@ else: from mock import patch -# TODO(howl-anderson): test CRF as the first layer +CRF_LOSS_OBJ_LIST = [crf.crf_loss, crf.ConditionalRandomFieldLoss()] @test_utils.run_all_in_graph_and_eager_modes @@ -108,16 +108,11 @@ def compute_log_likelihood(self): return manual_log_likelihood - def test_loss_function(self): - + def _test_loss_function(self, loss_obj): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile( - "adam", - loss=crf.ConditionalRandomFieldLoss(), - metrics=[tf.keras.metrics.Accuracy()], - ) + model.compile("adam", loss=loss_obj, metrics=[tf.keras.metrics.Accuracy()]) log_likelihood, _ = model.train_on_batch(self.logits, self.tags) @@ -128,207 +123,232 @@ def test_loss_function(self): self.assertAllClose(expected_log_likelihood, unbatched_log_likelihood) - def test_model_fit(self): - model = tf.keras.models.Sequential() - model.add(tf.keras.layers.Input(shape=(3, 5))) - model.add(self.crf) - model.compile( - "adam", - loss=crf.ConditionalRandomFieldLoss(), - metrics=[tf.keras.metrics.Accuracy()], - ) + def test_class_loss_function(self): + self._test_loss_function(crf.ConditionalRandomFieldLoss()) - model.fit(self.logits, self.tags, epochs=10, batch_size=1) + def test_func_loss_function(self): + self._test_loss_function(crf.crf_loss) - def test_dump_and_load(self): + def test_model_fit(self): + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(3, 5))) + model.add(self.crf) + model.compile( + "adam", loss=loss_obj, metrics=[tf.keras.metrics.Accuracy()] + ) + + model.fit(self.logits, self.tags, epochs=10, batch_size=1) + + def _test_dump_and_load(self, loss_obj): tmp_dir = self.get_temp_dir() MODEL_PERSISTENCE_PATH = os.path.join(tmp_dir, "test_saving_crf_model.h5") model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(3, 5))) model.add(self.crf) - model.compile( - "adam", loss="Addons>crf_loss", metrics=[tf.keras.metrics.Accuracy()] - ) + model.compile("adam", 
loss=loss_obj, metrics=[tf.keras.metrics.Accuracy()]) model.fit(self.logits, self.tags, epochs=10, batch_size=1) model.save(MODEL_PERSISTENCE_PATH) - new_model = tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) + # no news is good news + new_model = tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) new_model.fit(self.logits, self.tags, epochs=10, batch_size=1) - tf.keras.models.load_model(MODEL_PERSISTENCE_PATH) - try: os.remove(MODEL_PERSISTENCE_PATH) except OSError: pass - def test_mask_left_padding(self): - train_x = np.array( - [ - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - ], - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - ], - ] - ) # yapf: disable - - train_y = np.array( - [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X - ) # yapf: disable - - mask = np.array([[0, 1, 1], [1, 1, 1]]) - - layer = CRF(5) + def test_dump_and_load_with_class_loss(self): + # TODO(howl-anderson): wait for the PR merged + self.skipTest("require tensorflow/tensorflow#37018 merged") - x = tf.keras.layers.Input(shape=(3, 5)) - y = layer(x, mask=tf.constant(mask)) + self._test_dump_and_load(crf.ConditionalRandomFieldLoss()) - # check shape inference - model = tf.keras.models.Model(x, y) - model.compile("adam", crf.ConditionalRandomFieldLoss()) - - with self.assertRaises(tf.errors.InvalidArgumentError) as context: - model.fit(train_x, train_y) - - self.assertTrue( - "CRF layer do not support left padding" in context.exception.message - ) + def test_mask_left_padding(self): + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], + ] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + mask = np.array([[0, 1, 1], [1, 1, 1]]) + + layer = CRF(5) + + x = tf.keras.layers.Input(shape=(3, 5)) + y = layer(x, mask=tf.constant(mask)) + + # check shape inference + model = tf.keras.models.Model(x, y) + model.compile("adam", loss_obj) + + with self.assertRaises(tf.errors.InvalidArgumentError) as context: + model.fit(train_x, train_y) + + self.assertTrue( + "CRF layer do not support left padding" in context.exception.message + ) def test_mask_right_padding(self): - train_x = np.array( - [ - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - ], - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - ], - ] - ) # yapf: disable - - train_y = np.array( - [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X - ) # yapf: disable - - mask = np.array([[1, 1, 1], [1, 1, 0]]) - - layer = CRF(5) - - x = tf.keras.layers.Input(shape=(3, 5)) - y = layer(x, mask=tf.constant(mask)) - - # check shape inference - model = tf.keras.models.Model(x, y) - model.compile("adam", crf.ConditionalRandomFieldLoss()) - model.fit(train_x, train_y) + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 
1.0, 0.0, 0.0, 0.0], + ], + ] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + mask = np.array([[1, 1, 1], [1, 1, 0]]) + + layer = CRF(5) + + x = tf.keras.layers.Input(shape=(3, 5)) + y = layer(x, mask=tf.constant(mask)) + + # check shape inference + model = tf.keras.models.Model(x, y) + model.compile("adam", loss_obj) + model.fit(train_x, train_y) def test_in_subclass_model(self): - train_x = np.array( - [ - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - ], - [ - # O B-X I-X B-Y I-Y - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - ], - ] - ) # yapf: disable - - train_y = np.array( - [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X - ) # yapf: disable - - def patch_mark_as_return(outputs, acd): - """Marks `outputs` as the return values for automatic control - deps.""" - - def _mark_as_return(tensor): - """Marks `tensor` as the return value for automatic control - deps.""" - if not tensor_util.is_tensor(tensor): - return tensor - - # pylint: disable=protected-access - return_tensor = acd.mark_as_return(tensor) - if getattr(tensor, "_keras_mask", None) is not None: - return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask) - else: - return_tensor._keras_mask = None - - # TODO(howl-anderson) a little hack here, handle _keras_history - if getattr(tensor, "_keras_history", None) is not None: - return_tensor._keras_history = tensor._keras_history - - # Handle TensorFlow Probability attached metadata. - # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. - if getattr(tensor, "_tfp_distribution", None) is not None: - return_tensor._tfp_distribution = tensor._tfp_distribution - - return return_tensor - # pylint: enable=protected-access - - return nest.map_structure(_mark_as_return, outputs) - - class CRFModel(tf.keras.Model): - def __init__(self): - super(CRFModel, self).__init__() - - self.layer = CRF(5) - - def call(self, inputs): - return self.layer(inputs) - - @patch.object(base_layer_utils, "mark_as_return", patch_mark_as_return) - def __call__(self, inputs, *args, **kwargs): - outputs = super(CRFModel, self).__call__(inputs, *args, **kwargs) - - # A hack that add _keras_history to EagerTensor, make it more like normal Tensor - for tensor in tf.nest.flatten(outputs): - if not hasattr(tensor, "_keras_history"): - tensor._keras_history = (self, 0, 0) - - return outputs - - model = CRFModel() - - model.compile("adam", crf.ConditionalRandomFieldLoss()) - model.fit(train_x, train_y) + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + train_x = np.array( + [ + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + ], + [ + # O B-X I-X B-Y I-Y + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + ], + ] + ) # yapf: disable + + train_y = np.array( + [[1, 2, 2], [1, 1, 1]] # B-X I-X I-X # B-X B-X B-X + ) # yapf: disable + + def patch_mark_as_return(outputs, acd): + """Marks `outputs` as the return values for automatic control + deps.""" + + def _mark_as_return(tensor): + """Marks `tensor` as the return value for automatic control + deps.""" + if not tensor_util.is_tensor(tensor): + return tensor + + # pylint: disable=protected-access + return_tensor = acd.mark_as_return(tensor) + if getattr(tensor, "_keras_mask", None) is not None: + return_tensor._keras_mask = acd.mark_as_return( + 
tensor._keras_mask + ) + else: + return_tensor._keras_mask = None + + # TODO(howl-anderson) a little hack here, handle _keras_history + if getattr(tensor, "_keras_history", None) is not None: + return_tensor._keras_history = tensor._keras_history + + # Handle TensorFlow Probability attached metadata. + # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. + if getattr(tensor, "_tfp_distribution", None) is not None: + return_tensor._tfp_distribution = tensor._tfp_distribution + + return return_tensor + # pylint: enable=protected-access + + return nest.map_structure(_mark_as_return, outputs) + + class CRFModel(tf.keras.Model): + def __init__(self): + super(CRFModel, self).__init__() + + self.layer = CRF(5) + + def call(self, inputs): + return self.layer(inputs) + + @patch.object( + base_layer_utils, "mark_as_return", patch_mark_as_return + ) + def __call__(self, inputs, *args, **kwargs): + outputs = super(CRFModel, self).__call__( + inputs, *args, **kwargs + ) + + # A hack that add _keras_history to EagerTensor, make it more like normal Tensor + for tensor in tf.nest.flatten(outputs): + if not hasattr(tensor, "_keras_history"): + tensor._keras_history = (self, 0, 0) + + return outputs + + model = CRFModel() + + model.compile("adam", loss_obj) + model.fit(train_x, train_y) def test_serialization(self, dtype=None): - ref_fn = crf.crf_loss - config = tf.keras.losses.serialize(ref_fn) - fn = tf.keras.losses.deserialize(config) - self.assertEqual(ref_fn, fn) + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + ref_fn = loss_obj + config = tf.keras.losses.serialize(ref_fn) + fn = tf.keras.losses.deserialize(config) + self.assertEqual(ref_fn.get_config(), fn.get_config()) def test_keras_model_compile(self): - model = tf.keras.models.Sequential( - [tf.keras.layers.Input(shape=(3, 5)), self.crf] - ) + for loss_obj in CRF_LOSS_OBJ_LIST: + with self.subTest(loss_obj=loss_obj): + model = tf.keras.models.Sequential( + [tf.keras.layers.Input(shape=(3, 5)), self.crf] + ) - model.compile(loss="Addons>crf_loss", optimizer="adam") + model.compile(loss=loss_obj, optimizer="adam") if __name__ == "__main__": From ec2f8d07bfba2d55f8f15abbc3cbfeb48f118cad Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Wed, 4 Mar 2020 11:09:40 +0000 Subject: [PATCH 48/52] Cleanup of dev files. 
--- create_debug_docker.bash | 3 --- run_debug_docker.bash | 3 --- run_test_in_docker.bash | 13 ------------- 3 files changed, 19 deletions(-) delete mode 100755 create_debug_docker.bash delete mode 100755 run_debug_docker.bash delete mode 100755 run_test_in_docker.bash diff --git a/create_debug_docker.bash b/create_debug_docker.bash deleted file mode 100755 index 7f94f66101..0000000000 --- a/create_debug_docker.bash +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -docker run --name tf_addons -it -v ${PWD}:/addons -w /addons gcr.io/tensorflow-testing/nosla-ubuntu16.04-manylinux2010 /bin/bash diff --git a/run_debug_docker.bash b/run_debug_docker.bash deleted file mode 100755 index a5007d4512..0000000000 --- a/run_debug_docker.bash +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -docker container start -ai tf_addons diff --git a/run_test_in_docker.bash b/run_test_in_docker.bash deleted file mode 100755 index b81fe204aa..0000000000 --- a/run_test_in_docker.bash +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -bazel test -c opt -k \ - --test_timeout 300,450,1200,3600 \ - --test_output=all \ - --run_under=$(readlink -f tools/ci_testing/parallel_gpu_execute.sh) \ - //tensorflow_addons/layers:crf_test - -bazel test -c opt -k \ - --test_timeout 300,450,1200,3600 \ - --test_output=all \ - --run_under=$(readlink -f tools/ci_testing/parallel_gpu_execute.sh) \ - //tensorflow_addons/losses:crf_test From a8c0517b06dc7c71f667ad7d559034df51e3c378 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Wed, 4 Mar 2020 11:15:36 +0000 Subject: [PATCH 49/52] Removed empty files. --- tensorflow_addons/metrics/crf_accuracy.py | 0 tensorflow_addons/metrics/marginal_acc.py | 0 tensorflow_addons/metrics/viterbi_acc.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tensorflow_addons/metrics/crf_accuracy.py delete mode 100644 tensorflow_addons/metrics/marginal_acc.py delete mode 100644 tensorflow_addons/metrics/viterbi_acc.py diff --git a/tensorflow_addons/metrics/crf_accuracy.py b/tensorflow_addons/metrics/crf_accuracy.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tensorflow_addons/metrics/marginal_acc.py b/tensorflow_addons/metrics/marginal_acc.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tensorflow_addons/metrics/viterbi_acc.py b/tensorflow_addons/metrics/viterbi_acc.py deleted file mode 100644 index e69de29bb2..0000000000 From e694349f2bb2170d835f16f0cb75dfb33c524a7d Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Wed, 4 Mar 2020 11:19:26 +0000 Subject: [PATCH 50/52] Compatible only with py3 --- tensorflow_addons/losses/crf_test.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tensorflow_addons/losses/crf_test.py b/tensorflow_addons/losses/crf_test.py index 47e58b698d..7e6e3f9f2f 100644 --- a/tensorflow_addons/losses/crf_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -14,14 +14,11 @@ # ============================================================================== """Tests for Conditional Random Field loss.""" -from __future__ import absolute_import, division, print_function - import itertools import math import os import numpy as np -import six import tensorflow as tf from tensorflow.python.framework import tensor_util from tensorflow.python.keras.engine import base_layer_utils @@ -31,10 +28,7 @@ from tensorflow_addons.losses import crf from tensorflow_addons.utils import test_utils -if six.PY3: - from unittest.mock import patch -else: - from mock import patch +from unittest.mock import patch 
CRF_LOSS_OBJ_LIST = [crf.crf_loss, crf.ConditionalRandomFieldLoss()] @@ -42,7 +36,7 @@ @test_utils.run_all_in_graph_and_eager_modes class ConditionalRandomFieldLossTest(tf.test.TestCase): def setUp(self): - super(ConditionalRandomFieldLossTest, self).setUp() + super().setUp() self.logits = np.array( [ @@ -306,7 +300,7 @@ def _mark_as_return(tensor): class CRFModel(tf.keras.Model): def __init__(self): - super(CRFModel, self).__init__() + super().__init__() self.layer = CRF(5) @@ -317,7 +311,7 @@ def call(self, inputs): base_layer_utils, "mark_as_return", patch_mark_as_return ) def __call__(self, inputs, *args, **kwargs): - outputs = super(CRFModel, self).__call__( + outputs = super().__call__( inputs, *args, **kwargs ) @@ -333,7 +327,7 @@ def __call__(self, inputs, *args, **kwargs): model.compile("adam", loss_obj) model.fit(train_x, train_y) - def test_serialization(self, dtype=None): + def test_serialization(self): for loss_obj in CRF_LOSS_OBJ_LIST: with self.subTest(loss_obj=loss_obj): ref_fn = loss_obj From 9de6cf06f7c2e34fcd5ae127c8a743e56e6d3888 Mon Sep 17 00:00:00 2001 From: gabrieldemarmiesse Date: Wed, 4 Mar 2020 11:34:54 +0000 Subject: [PATCH 51/52] Formatting. --- tensorflow_addons/losses/crf_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow_addons/losses/crf_test.py b/tensorflow_addons/losses/crf_test.py index 7e6e3f9f2f..b32dcdec80 100644 --- a/tensorflow_addons/losses/crf_test.py +++ b/tensorflow_addons/losses/crf_test.py @@ -311,9 +311,7 @@ def call(self, inputs): base_layer_utils, "mark_as_return", patch_mark_as_return ) def __call__(self, inputs, *args, **kwargs): - outputs = super().__call__( - inputs, *args, **kwargs - ) + outputs = super().__call__(inputs, *args, **kwargs) # A hack that add _keras_history to EagerTensor, make it more like normal Tensor for tensor in tf.nest.flatten(outputs): From 307678d2773cfc2164c892d0b40f4cc93caaea63 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Wed, 25 Mar 2020 13:57:46 +0800 Subject: [PATCH 52/52] Update CODEOWNERS --- .github/CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6ce517bc57..e73dc4c651 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -101,3 +101,6 @@ /tensorflow_addons/text/crf*.py @squadrick /tensorflow_addons/text/parse_time*.py @helinwang /tensorflow_addons/text/skip_gram*.py @rahulunair + +/tensorflow_addons/layers/crf*.py @howl-anderson +/tensorflow_addons/losses/crf*.py @howl-anderson
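
For reference, a minimal end-to-end usage sketch of the API as it stands after this series, modeled on the tests above; the toy shapes (batch of 2, sequence length 3, 5 tags) and training settings are illustrative assumptions, not part of the patches:

```python
import numpy as np
import tensorflow as tf

from tensorflow_addons.layers.crf import CRF
from tensorflow_addons.losses import crf

# Toy data in the same shape the tests use: (batch=2, seq_len=3, num_tags=5).
x = np.random.uniform(size=(2, 3, 5)).astype("float32")
y = np.array([[1, 2, 2], [1, 1, 1]])  # gold tag indices per time step

model = tf.keras.models.Sequential(
    [tf.keras.layers.Input(shape=(3, 5)), CRF(5)]
)

# `crf.crf_loss` and `crf.ConditionalRandomFieldLoss()` are interchangeable
# here: after patch 47, `crf_loss` is itself an instance of the loss class.
model.compile("adam", loss=crf.crf_loss, metrics=[tf.keras.metrics.Accuracy()])
model.fit(x, y, epochs=2, batch_size=1)
```
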