research/neural_programmer/wiki_data.py

# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads the WikiQuestions dataset.

An example consists of a question and a table. Additionally, we store the
processed columns, which hold the entries after number, date and other
preprocessing has been applied, as done in the baseline.
Columns, column names and processed columns are each split into word columns
and number columns.
The lookup answer (or matrix) is likewise split into a number lookup matrix
and a word lookup matrix.
Author: aneelakantan (Arvind Neelakantan)
"""
import math
import os
import re
import numpy as np
import unicodedata as ud
import tensorflow as tf

# Placeholder that load_annotated_tables writes into every cell of the original
# and processed columns before a table is parsed; a cell for which no value is
# parsed keeps it. Original note: number that is added to a corrupted table
# entry in a number column.
bad_number = -200000.0

def is_nan_or_inf(number):
  """Returns True when `number` is NaN or positive/negative infinity."""
  if math.isnan(number):
    return True
  return math.isinf(number)

def strip_accents(s):
  """Removes combining accent marks from text.

  The text is decomposed with Unicode NFKD normalization and every character
  whose category is "Mn" (nonspacing mark) is dropped, so an accented "e"
  becomes a plain "e".

  Args:
    s: UTF-8 encoded byte string, or an already-decoded unicode string.

  Returns:
    The accent-free text encoded as a UTF-8 byte string.
  """
  # Decode only when given bytes: the former unicode(s, "utf-8") call raised
  # TypeError for text that was already decoded.
  text = s.decode("utf-8") if isinstance(s, bytes) else s
  kept = [c for c in ud.normalize('NFKD', text) if ud.category(c) != 'Mn']
  return "".join(kept).encode("utf-8")


def correct_unicode(string):
  """Replaces or removes a fixed set of non-ASCII sequences in `string`.

  Accents are stripped first (strip_accents), then a list of literal
  substitutions is applied in order, and finally every run of whitespace is
  collapsed to a single space.

  Args:
    string: text to clean; it is passed unchanged to strip_accents.

  Returns:
    The cleaned string with leading and trailing whitespace removed.
  """
  string = strip_accents(string)
  # "\xc2\xa0" is the UTF-8 byte sequence of U+00A0 (no-break space).
  string = re.sub("\xc2\xa0", " ", string).strip()
  # "\xe2\x80\x93" is the UTF-8 byte sequence of U+2013 (en dash).
  string = re.sub("\xe2\x80\x93", "-", string).strip()
  #string = re.sub(ur'[\u0300-\u036F]', "", string)
  # NOTE(review): the non-ASCII patterns below look like UTF-8 punctuation that
  # was mis-decoded with a single-byte code page (they are mapped to ",",
  # "...", "^", "~", "<" and ">"). Confirm that these literals match what
  # actually occurs in the data before touching them.
  string = re.sub("â€š", ",", string)
  string = re.sub("â€¦", "...", string)
  #string = re.sub("[Â·ãƒ»]", ".", string)
  string = re.sub("Ë†", "^", string)
  string = re.sub("Ëœ", "~", string)
  string = re.sub("â€¹", "<", string)
  string = re.sub("â€º", ">", string)
  # Disabled substitutions, kept for reference.
  #string = re.sub("[â€˜â€™Â´`]", "'", string)
  #string = re.sub("[â€œâ€Â«Â»]", "\"", string)
  #string = re.sub("[â€¢â€ â€¡]", "", string)
  #string = re.sub("[â€â€‘â€“â€”]", "-", string)
  # Delete characters in the range U+2E00-U+FFFF.
  # NOTE(review): strip_accents returns UTF-8 encoded text, so this pattern is
  # presumably matched against individual bytes here - confirm it has any
  # effect.
  string = re.sub(ur'[\u2E00-\uFFFF]', "", string)
  # Collapse whitespace runs and trim both ends.
  string = re.sub("\\s+", " ", string).strip()
  return string


def simple_normalize(string):
  """Applies light cleanup to a piece of table or question text.

  After correct_unicode has run, this removes citation markers such as "[12]"
  or "[nb 3]", trailing asterisks, parenthesised digit groups such as "(1999)"
  or "(1999 - 2004)", and a pair of double quotes that wraps the whole string.

  Args:
    string: text to normalize; it is passed unchanged to correct_unicode.

  Returns:
    The normalized string.
  """
  string = correct_unicode(string)
  # Citations
  string = re.sub(r"\[(nb ?)?\d+\]", "", string)
  string = re.sub(r"\*+$", "", string)
  # Year in parenthesis
  string = re.sub(r"\(\d* ?-? ?\d*\)", "", string)
  # Outermost quotation marks: keep the quoted text. The replacement used to be
  # "", which threw away the entire string whenever it was wrapped in quotes.
  string = re.sub(r'^"(.*)"$', r"\1", string)
  return string


def full_normalize(string):
  """Fully normalizes a token or cell string for use as a vocabulary word.

  Runs simple_normalize and final_normalize and, around them, removes
  bracketed segments, characters in the range U+007F-U+FFFF, a trailing
  parenthesised segment and question marks, and turns colons, slashes,
  backslashes and dashes into spaces.

  Args:
    string: text to normalize.

  Returns:
    The normalized string, or "UNK" when nothing is left after normalization.
  """
  string = simple_normalize(string)
  # Remove every bracketed segment (the pattern is not anchored to the end)
  string = re.sub("\[[^\]]*\]", "", string)
  # Remove most unicode characters in other languages
  # NOTE(review): the input presumably is UTF-8 encoded bytes at this point, in
  # which case this strips every byte >= 0x7F - confirm.
  string = re.sub(ur'[\u007F-\uFFFF]', "", string.strip())
  # Remove trailing info in parenthesis
  string = re.sub("\([^)]*\)$", "", string.strip())
  # Lowercase and drop quotes, backticks and asterisks
  string = final_normalize(string)
  # Get rid of question marks
  string = re.sub("\?", "", string).strip()
  # Get rid of trailing colons (usually occur in column titles)
  string = re.sub("\:$", " ", string).strip()
  # Get rid of slashes
  string = re.sub(r"/", " ", string).strip()
  string = re.sub(r"\\", " ", string).strip()
  # Replace colon, slash, and dash with space
  # Note: need better replacement for this when parsing time
  string = re.sub(r"\:", " ", string).strip()
  # Slashes were already replaced above, so this pass finds nothing to change.
  string = re.sub("/", " ", string).strip()
  string = re.sub("-", " ", string).strip()
  # Convert empty strings to UNK
  # Important to do this last or near last
  if not string:
    string = "UNK"
  return string

def final_normalize(string):
  """Lowercases `string` and strips quote-like punctuation from it.

  Args:
    string: text to normalize.

  Returns:
    The text with whitespace runs collapsed, converted to lowercase, with
    literal backslash-n sequences turned into spaces and with double quotes,
    single quotes, backticks and asterisks removed.
  """
  # Collapse whitespace, trim both ends, then lowercase.
  cleaned = re.sub(r"\s+", " ", string).strip().lower()
  # A backslash followed by "n" is an escaped newline that survived as text.
  cleaned = cleaned.replace("\\n", " ").strip()
  # Drop quotation marks, backticks and asterisks one character at a time,
  # trimming after each removal.
  for unwanted in ('"', "'", "`", "*"):
    cleaned = cleaned.replace(unwanted, "").strip()
  return cleaned

def is_number(x):
  """Returns True if `x` converts to a finite float.

  Args:
    x: any object; typically a token string or a float.

  Returns:
    False when float(x) fails or yields NaN or infinity, True otherwise.
  """
  try:
    f = float(x)
  except (ValueError, TypeError, OverflowError):
    # OverflowError is raised for integers outside the float range (for
    # example 10 ** 400); it previously escaped this function.
    return False
  return not is_nan_or_inf(f)


class WikiExample(object):
  """One question together with its answer and answer-type flags."""

  def __init__(self, id, question, answer, table_key):
    """Stores the example data and clears every classification flag.

    Args:
      id: identifier of the question.
      question: the question as given by the caller.
      answer: the answer as given by the caller.
      table_key: key of the table the question refers to.
    """
    # Data supplied by the caller.
    self.table_key = table_key
    self.answer = answer
    self.question = question
    self.question_id = id
    # Starts out empty.
    self.lookup_matrix = []
    # Answer-type flags; none of them is set at construction time.
    (self.is_bad_example, self.is_word_lookup, self.is_ambiguous_word_lookup,
     self.is_number_lookup, self.is_number_calc,
     self.is_unknown_answer) = (False,) * 6


class TableInfo(object):
  """Plain container for one table, split into word and number columns."""

  def __init__(self, word_columns, word_column_names, word_column_indices,
               number_columns, number_column_names, number_column_indices,
               processed_word_columns, processed_number_columns, orig_columns):
    """Stores every argument unchanged under an attribute of the same name."""
    # All columns of the table, before the word/number split.
    self.orig_columns = orig_columns
    # Word columns: contents, names and positions.
    (self.word_columns, self.word_column_names,
     self.word_column_indices) = (word_columns, word_column_names,
                                  word_column_indices)
    # Number columns: contents, names and positions.
    (self.number_columns, self.number_column_names,
     self.number_column_indices) = (number_columns, number_column_names,
                                    number_column_indices)
    # Processed counterparts of the word and number columns.
    self.processed_word_columns = processed_word_columns
    self.processed_number_columns = processed_number_columns


class WikiQuestionLoader(object):
  """Reads the question ids listed in one examples file of the dataset."""

  def __init__(self, data_name, root_folder):
    """Remembers where the examples file lives; nothing is read yet.

    Args:
      data_name: file name of the examples file inside `<root_folder>/data`.
      root_folder: root directory of the dataset.
    """
    self.root_folder = root_folder
    self.data_folder = os.path.join(self.root_folder, "data")
    self.examples = []
    self.data_name = data_name

  def num_questions(self):
    """Returns the number of question ids loaded so far."""
    return len(self.examples)

  def _parse_ids(self, lines):
    """Appends the id found in the "(id ...)" field of each line.

    Args:
      lines: iterable of text lines.

    Raises:
      ValueError: if a non-blank line has no "(id ...)" field. Such a line used
        to fail with an AttributeError on None.
    """
    id_regex = re.compile(r"\(id ([^\)]*)\)")
    for line in lines:
      if not line.strip():
        # Blank lines (for example at the end of the file) carry no example.
        continue
      id_match = id_regex.search(line)
      if id_match is None:
        raise ValueError("No '(id ...)' field in line: %r" % line)
      self.examples.append(id_match.group(1))

  def load_qa(self):
    """Reads the examples file and collects the question ids in order."""
    data_source = os.path.join(self.data_folder, self.data_name)
    # The context manager closes the file even when parsing fails.
    with tf.gfile.GFile(data_source, "r") as f:
      self._parse_ids(f)

  def load(self):
    """Loads the question ids; see load_qa."""
    self.load_qa()


def is_date(word):
  """Checks whether `word` has the shape of a normalized date value.

  A date is exactly ten characters long, has "-" at positions 4 and 7, is made
  up only of digits, "X", "x" and "-", and contains at least one letter or
  digit (for example "2010-01-05" or "XXXX-01-05").

  Args:
    word: string to test.

  Returns:
    True if `word` matches the description above, False otherwise.
  """
  if re.search("[a-z0-9]", word, re.IGNORECASE) is None:
    return False
  if len(word) != 10 or word[4] != "-" or word[7] != "-":
    return False
  allowed = frozenset("0123456789Xx-")
  return all(ch in allowed for ch in word)


class WikiQuestionGenerator(object):
  """Loads the annotated questions and tables and builds the data splits.

  Call load() to obtain the train, dev and test examples as lists of
  WikiExample objects whose table and answer attributes have been filled in.
  """

  # NER tags whose annotated value is used in place of the surface token.
  _VALUE_NER_TAGS = ("NUMBER", "MONEY", "PERCENT", "DATE")
  # Symbols deleted from an NER value before it is inspected.
  _VALUE_SYMBOLS = (">", "<", "=", "%", "~", "$", "£", "€")
  # Number of tab-separated fields on each line of an annotated table file.
  _TABLE_FIELDS = 13

  def __init__(self, train_name, dev_name, test_name, root_folder):
    """Creates one question loader per split; no file is read yet.

    Args:
      train_name: file name of the training examples file.
      dev_name: file name of the development examples file.
      test_name: file name of the test examples file.
      root_folder: root directory of the dataset.
    """
    self.train_name = train_name
    self.dev_name = dev_name
    self.test_name = test_name
    self.train_loader = WikiQuestionLoader(train_name, root_folder)
    self.dev_loader = WikiQuestionLoader(dev_name, root_folder)
    self.test_loader = WikiQuestionLoader(test_name, root_folder)
    self.bad_examples = 0
    self.root_folder = root_folder
    self.data_folder = os.path.join(self.root_folder, "annotated/data")
    # question id -> WikiExample, and table key -> TableInfo; both are rebuilt
    # by load_annotated_data.
    self.annotated_examples = {}
    self.annotated_tables = {}
    # Tokens that are never added to a processed sentence.
    self.annotated_word_reject = {"-lrb-": 1, "-rrb-": 1, "UNK": 1}

  def _strip_value_symbols(self, value):
    """Returns `value` without comparison, percent and currency symbols."""
    for symbol in self._VALUE_SYMBOLS:
      value = value.replace(symbol, "")
    return value

  def is_money(self, word):
    """Returns True if `word` consists only of digits, "." and "E".

    The word must also contain at least one letter or digit, so a string made
    up of dots alone is rejected.
    """
    if not re.search("[a-z0-9]", word, re.IGNORECASE):
      return False
    return all(ch == "E" or ch == "." or re.search("[0-9]", ch) for ch in word)

  def remove_consecutive(self, ner_tags, ner_values):
    """Marks the first of two adjacent tokens that share one NER value.

    When token i and token i + 1 carry the same value-bearing tag and the same
    non-empty value, the value of token i is overwritten in place with a
    marker: "A" when the value contains an upper-case letter and is neither a
    date nor money, "," otherwise. pre_process_sentence falls back to the
    surface token for "A" and ends up dropping the token for ",".

    Args:
      ner_tags: list of NER tags, one per token.
      ner_values: list of NER values, one per token; modified in place.

    Returns:
      The tuple (ner_tags, ner_values).
    """
    for i in range(len(ner_tags) - 1):
      tag = ner_tags[i]
      value = ner_values[i]
      if (tag in self._VALUE_NER_TAGS and tag == ner_tags[i + 1] and
          value == ner_values[i + 1] and value != ""):
        word = self._strip_value_symbols(value)
        if (re.search("[A-Z]", word) and not is_date(word) and
            not self.is_money(word)):
          ner_values[i] = "A"
        else:
          ner_values[i] = ","
    return ner_tags, ner_values

  def pre_process_sentence(self, tokens, ner_tags, ner_values):
    """Turns one annotated text into a list of words and numbers.

    Args:
      tokens: "|"-separated tokens.
      ner_tags: "|"-separated NER tags, one per token.
      ner_values: "|"-separated NER values, one per token.

    Returns:
      A list whose items are floats (numeric tokens) or strings (date and
      money values unchanged, every other word after full_normalize with its
      commas removed). Rejected words and words without a letter or digit are
      left out; if nothing remains the list is ["UNK"].
    """
    sentence = []
    tokens = tokens.split("|")
    ner_tags = ner_tags.split("|")
    ner_values = ner_values.split("|")
    ner_tags, ner_values = self.remove_consecutive(ner_tags, ner_values)
    for i, token in enumerate(tokens):
      word = token
      if ner_values[i] != "" and ner_tags[i] in self._VALUE_NER_TAGS:
        # Prefer the annotated value over the surface form.
        word = self._strip_value_symbols(ner_values[i])
        if (re.search("[A-Z]", word) and not is_date(word) and
            not self.is_money(word)):
          # The value is not usable as a number, date or amount.
          word = token
        if is_number(ner_values[i]):
          word = float(ner_values[i])
        elif is_number(word):
          word = float(word)
        if token == "score":
          word = "score"
      if is_number(word):
        word = float(word)
      if word in self.annotated_word_reject:
        continue
      if is_number(word) or is_date(word) or self.is_money(word):
        sentence.append(word)
        continue
      word = full_normalize(word)
      if (word not in self.annotated_word_reject and
          re.search("[a-z0-9]", word, re.IGNORECASE)):
        sentence.append(word.replace(",", ""))
    if not sentence:
      sentence.append("UNK")
    return sentence

  def load_annotated_data(self, in_file):
    """Reads an annotated questions file into self.annotated_examples.

    Both self.annotated_examples and self.annotated_tables are reset. Each
    line after the first must hold ten tab-separated fields; the table of
    every question is registered in self.annotated_tables with an empty
    placeholder that load_annotated_tables replaces.

    Args:
      in_file: path of the annotated questions file.
    """
    self.annotated_examples = {}
    self.annotated_tables = {}
    # The context manager closes the file even when a line fails to parse.
    with tf.gfile.GFile(in_file, "r") as f:
      for line_number, line in enumerate(f):
        if line_number == 0:
          continue  # the first line is skipped
        line = line.strip()
        (question_id, utterance, context, target_value, tokens, lemma_tokens,
         pos_tags, ner_tags, ner_values, target_canon) = line.split("\t")
        question = self.pre_process_sentence(tokens, ner_tags, ner_values)
        target_canon = target_canon.split("|")
        self.annotated_examples[question_id] = WikiExample(
            question_id, question, target_canon, context)
        self.annotated_tables[context] = []
    # A single parenthesised string prints the same text on Python 2 and 3.
    print("Annotated examples loaded  %d" % len(self.annotated_examples))

  def is_number_column(self, a):
    """Returns True if every cell of column `a` is a single numeric item."""
    return all(len(w) == 1 and is_number(w[0]) for w in a)

  def convert_table(self, table):
    """Joins the items of every cell into one space-separated string.

    Args:
      table: list of columns, each a list of cells, each a list of items.

    Returns:
      A list of columns with the same layout whose cells are strings.
    """
    return [[" ".join(str(w) for w in cell) for cell in column]
            for column in table]

  def _read_table_rows(self, table_path):
    """Returns the fields of every line but the first of an annotated table."""
    with tf.gfile.GFile(table_path, "r") as f:
      lines = f.readlines()
    rows = []
    for line in lines[1:]:
      line = line.strip()
      # strip() also removes trailing tabs, i.e. empty trailing fields, so the
      # line is padded back to the expected number of fields.
      line = line + "\t" * (self._TABLE_FIELDS - len(line.split("\t")))
      rows.append(line.split("\t"))
    return rows

  def load_annotated_tables(self):
    """Reads every table registered in self.annotated_tables.

    For each table key the annotated table file is parsed and the key is
    mapped to a TableInfo. A column is a number column when every one of its
    cells is a single number, and a word column otherwise. The processed
    columns hold, per cell, a number extracted from the cell (or from a DATE
    value with "-" and "X" removed), and bad_number when there is none.

    Raises:
      ValueError: if a table file has no line after its first one (this used to
        reuse the size of the previous table or fail with a NameError), or if a
        line has more than 13 fields.
    """
    for table in list(self.annotated_tables.keys()):
      annotated_table = table.replace("csv", "annotated")
      # The file is read once; it used to be opened and parsed twice.
      rows = self._read_table_rows(
          os.path.join(self.root_folder, annotated_table))
      if not rows:
        raise ValueError("Annotated table %s has no rows" % annotated_table)
      # The last line is taken to carry the largest row and column index.
      max_row = int(rows[-1][0])
      max_col = int(rows[-1][1])
      orig_columns = [[bad_number] * (max_row + 1) for _ in range(max_col + 1)]
      processed_columns = [[bad_number] * (max_row + 1)
                           for _ in range(max_col + 1)]
      column_names = []
      for fields in rows:
        (row, col, read_id, content, tokens, lemma_tokens, pos_tags, ner_tags,
         ner_values, number, date, num2, read_list) = fields
        entry = self.pre_process_sentence(tokens, ner_tags, ner_values)
        if row == "-1":
          # Row -1 holds the column names.
          column_names.append(entry)
          continue
        row_index = int(row)
        col_index = int(col)
        orig_columns[col_index][row_index] = entry
        if len(entry) == 1 and is_number(entry[0]):
          processed_columns[col_index][row_index] = float(entry[0])
        else:
          # Use the first number in the cell ...
          for single_entry in entry:
            if is_number(single_entry):
              processed_columns[col_index][row_index] = float(single_entry)
              break
          # ... unless the cell has a DATE value that reads as a number once
          # "-" and "X" are removed; the last such value wins.
          nt = ner_tags.split("|")
          nv = ner_values.split("|")
          for i_entry in range(len(tokens.split("|"))):
            if nt[i_entry] != "DATE":
              continue
            date_digits = nv[i_entry].replace("-", "").replace("X", "")
            if is_number(date_digits):
              processed_columns[col_index][row_index] = float(date_digits)
        if (len(entry) == 1 and not is_number(entry[0]) and
            is_date(entry[0])):
          # entry is the list stored in orig_columns, so this edits the cell.
          entry[0] = entry[0].replace("X", "x")
      word_columns = []
      processed_word_columns = []
      word_column_names = []
      word_column_indices = []
      number_columns = []
      processed_number_columns = []
      number_column_names = []
      number_column_indices = []
      for i in range(max_col + 1):
        if self.is_number_column(orig_columns[i]):
          number_column_indices.append(i)
          number_column_names.append(column_names[i])
          number_columns.append(
              [w[0] for w in orig_columns[i] if is_number(w[0])])
          processed_number_columns.append(processed_columns[i])
        else:
          word_column_indices.append(i)
          word_column_names.append(column_names[i])
          word_columns.append(orig_columns[i])
          processed_word_columns.append(processed_columns[i])
      self.annotated_tables[table] = TableInfo(
          word_columns, word_column_names, word_column_indices, number_columns,
          number_column_names, number_column_indices, processed_word_columns,
          processed_number_columns, orig_columns)

  def _matched_cells(self, matched_cells):
    """Yields the (row, col) integer pairs of a "r,c|r,c|..." field."""
    for match_cell in matched_cells.split("|"):
      parts = match_cell.split(",")
      if len(parts) == 2:
        yield int(parts[0]), int(parts[1])

  def answer_classification(self):
    """Classifies every loaded example and attaches its table data.

    Reads "<root_folder>/arvind-with-norms-2.tsv", whose lines hold an example
    id, an answer index, the raw answer, the processed answer and the table
    cells that match the answer. For every example in self.annotated_examples
    this sets lookup_matrix, the is_* flags, number_answer, calc_answer,
    lookup_number_answer, and the word/number views of the table and of the
    lookup matrix.

    An answer counts as "in the table" when every answer item has at least one
    matching cell in a row >= 0.
    """
    ice = {}
    with tf.gfile.GFile(
        self.root_folder + "/arvind-with-norms-2.tsv", mode="r") as f:
      for line in f.readlines():
        fields = line.strip().split("\t")
        if fields[0] not in self.annotated_examples:
          continue
        if len(fields) == 4:
          # Presumably strip() removed an empty matched-cells field; restore it.
          fields.append("")
        (example_id, ans_index, ans_raw, process_answer,
         matched_cells) = fields
        ice.setdefault(example_id, []).append(fields)
    for q_id, example in self.annotated_examples.items():
      table_info = self.annotated_tables[example.table_key]
      # Figure out if the answer is numerical or lookup
      n_cols = len(table_info.orig_columns)
      n_rows = len(table_info.orig_columns[0])
      example.lookup_matrix = np.zeros((n_rows, n_cols))
      exact_matches = set()
      for (_, ans_index, _, _, matched_cells) in ice[q_id]:
        for row, _ in self._matched_cells(matched_cells):
          if row >= 0:
            exact_matches.add(ans_index)
      answer_is_in_table = len(exact_matches) == len(example.answer)
      if answer_is_in_table:
        for (_, ans_index, _, _, matched_cells) in ice[q_id]:
          for row, col in self._matched_cells(matched_cells):
            # Skip matches in negative rows: a negative index would wrap
            # around and mark a cell at the bottom of the matrix instead.
            if row >= 0:
              example.lookup_matrix[row, col] = float(ans_index) + 1.0
      example.lookup_number_answer = 0.0
      has_single_number_answer = (
          len(example.answer) == 1 and is_number(example.answer[0]))
      if answer_is_in_table:
        if has_single_number_answer:
          example.number_answer = float(example.answer[0])
          example.is_number_lookup = True
        else:
          example.calc_answer = example.number_answer = 0.0
          example.is_word_lookup = True
      elif has_single_number_answer:
        # NOTE(review): the answer is stored as text here but as a float in the
        # lookup branch above; kept as is - confirm what consumers expect.
        example.number_answer = example.answer[0]
        example.is_number_calc = True
      else:
        example.is_bad_example = True
        example.is_unknown_answer = True
      example.is_lookup = example.is_word_lookup or example.is_number_lookup
      if not example.is_word_lookup and not example.is_bad_example:
        example.calc_answer = example.answer[0]
        example.lookup_number_answer = example.calc_answer
      # Split up the lookup matrix into word part and number part
      number_column_indices = table_info.number_column_indices
      word_column_indices = table_info.word_column_indices
      example.word_columns = table_info.word_columns
      example.number_columns = table_info.number_columns
      example.word_column_names = table_info.word_column_names
      example.processed_number_columns = table_info.processed_number_columns
      example.processed_word_columns = table_info.processed_word_columns
      example.number_column_names = table_info.number_column_names
      example.number_lookup_matrix = example.lookup_matrix[
          :, number_column_indices]
      example.word_lookup_matrix = example.lookup_matrix[:, word_column_indices]

  def load(self):
    """Loads and classifies all examples.

    The training annotations serve the train and dev splits; the
    "pristine-unseen-tables" annotations serve the test split.

    Returns:
      A tuple (train_data, dev_data, test_data) of lists of WikiExample.
    """
    self.load_annotated_data(
        os.path.join(self.data_folder, "training.annotated"))
    self.load_annotated_tables()
    self.answer_classification()
    self.train_loader.load()
    self.dev_loader.load()
    train_data = [
        self.annotated_examples[q_id] for q_id in self.train_loader.examples
    ]
    dev_data = [
        self.annotated_examples[q_id] for q_id in self.dev_loader.examples
    ]

    # Loading the test annotations replaces self.annotated_examples, so the
    # train and dev lists have to be built before this point.
    self.load_annotated_data(
        os.path.join(self.data_folder, "pristine-unseen-tables.annotated"))
    self.load_annotated_tables()
    self.answer_classification()
    self.test_loader.load()
    test_data = [
        self.annotated_examples[q_id] for q_id in self.test_loader.examples
    ]
    return train_data, dev_data, test_data