# nlp_project.py

# -*- coding: utf-8 -*-
"""NLP_Project.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1gNE2BdGURa12U1-2Ai8JZEyXkeAyUXG9

## WordNet Features
"""

import linecache
from itertools import islice
import pandas as pd
from pprint import pprint


class CorpusReader:
  """Reader for Task 1: turns an annotated corpus file into DataFrames.

  The parser treats the file as records of four physical lines each. The
  first line of a record holds a sentence number, a tab and the sentence in
  double quotes; the second line holds the relation label. The remaining two
  lines of every record are ignored.
  """

  def create_dicts(self, path):
    """
      Read the corpus file at ``path`` and return ``(line_d, rel_d)``.

      ``line_d`` maps each sentence number (the integer before the first tab)
      to the sentence text with every double-quote character and the line
      terminator removed. ``rel_d`` maps a 0-based record counter to the
      relation label found on the line right after the sentence.

      A blank sentence line (for example surplus newlines at the end of the
      file) is skipped together with its record instead of raising
      ``ValueError``. A non-blank sentence line that does not start with an
      integer still raises ``ValueError``.
    """
    line_d = {}
    rel_d = {}
    keep_record = False

    # Single pass over the file: only the first two lines of each four-line
    # record carry data.
    with open(path, encoding='utf-8') as f:
      for offset, raw in enumerate(f):
        position = offset % 4
        if position > 1:
          continue

        text = raw.rstrip('\n')
        if position == 0:
          keep_record = bool(text.strip())
          if keep_record:
            prefix, *quoted = text.split('"')
            line_d[int(prefix.split('\t')[0])] = ''.join(quoted)
        elif keep_record:
          # The relation is only stored when its sentence was stored, so
          # both dictionaries stay aligned record by record.
          rel_d[len(rel_d)] = text

    return (line_d, rel_d)

  def create_dataframe(self, dictionary_to_convert, cols):
    """
      Build a DataFrame with the columns ``cols`` from the values of
      ``dictionary_to_convert``.

      The dictionary keys are discarded: the returned frame is re-indexed
      0..n-1 in the dictionary's iteration order.
    """
    frame = pd.DataFrame.from_dict(dictionary_to_convert, orient='index', columns=cols)
    return frame.reset_index(drop=True)

  def parse_data(self, path_to_file):
    """
      Parse the corpus file and return ``(line_df, rel_df)``.

      ``line_df`` has the columns 'line' and 'relation'; ``rel_df`` has the
      single column 'relation'.
    """
    line_dict, rel_dict = self.create_dicts(path_to_file)

    line_df = self.create_dataframe(line_dict, ['line'])
    rel_df = self.create_dataframe(rel_dict, ['relation'])

    # Both frames are indexed 0..n-1, so this pairs sentence k with label k.
    line_df['relation'] = rel_df['relation']

    return (line_df, rel_df)

# Task 1

# Parse the sample corpus and echo every sentence with its relation label.
lines = CorpusReader()
ans_df, rel_df = lines.parse_data('/content/test_sentence')

print("The Output of Task 1 is: \n")
print("The Corpus has ", ans_df.shape[0], " sentences\n")
for index in ans_df.index:
  # Label-based lookup of the two cells that belong to this row.
  print("The Parsed Line is             : ", ans_df.loc[index, 'line'])
  print("The Parsed Line has Relation   : ", ans_df.loc[index, 'relation'])
  print('\n')

# Task 2

import nltk
import spacy
from spacy import displacy
from nltk.corpus import wordnet as wn
import re

# Fetch the NLTK resources requested by this script: the WordNet corpus used
# by the relation lookups and the 'punkt' tokenizer data.
nltk.download('wordnet')
nltk.download('punkt')


# Load the small English spaCy pipeline once. The code below refers to it
# under both names, so `sp` is an alias of `nlp` rather than a second copy of
# the same model.
nlp = spacy.load("en_core_web_sm")
sp = nlp

class AllTasks(CorpusReader):
  """Task 2 feature builders that add columns to the corpus DataFrame.

  Every ``create_*`` method adds its columns to the DataFrame it receives and
  returns that same DataFrame. The methods use the module-level names
  ``nltk``, ``wn`` (WordNet), ``re`` and the spaCy pipelines ``sp`` / ``nlp``.
  """

  # Pieces that remain after '<', 'e1', '>' has been merged into '<e1>'.
  _TAG_FRAGMENTS = frozenset(('e1', 'e2', '/e1', '/e2', '>'))
  # Every token that belongs to an entity tag, merged or not.
  _TAG_TOKENS = frozenset(
    ('e1', '/e1', 'e2', '/e2', '<', '>', '<e1>', '</e1>', '<e2>', '</e2>'))
  _E1_PATTERN = re.compile(r'<e1>(.*?)</e1>')
  _E2_PATTERN = re.compile(r'<e2>(.*?)</e2>')
  _NO_ENTITY_LABEL = 'NOT RECOGNIZED'

  def create_tokens(self, dataframe):
    """
      Tokenize the 'line' column and add two columns of ', '-joined tokens.

      'tokens' keeps the entity tags as single tokens ('<e1>', '</e1>', ...);
      'filtered tokens' drops the tags altogether. Note that a literal 'e1',
      'e2', '/e1', '/e2' or '>' token in the sentence is removed as well.
      An empty sentence yields an empty string in both columns.
    """
    token_column = []
    filtered_column = []

    for sentence in dataframe['line']:
      raw_tokens = nltk.word_tokenize(sentence)
      # The tokenizer splits '<e1>' into '<', 'e1', '>'; glue each '<' to the
      # two tokens that follow it so the tag becomes one token again.
      merged = [
        ''.join(raw_tokens[i:i + 3]) if token == '<' else token
        for i, token in enumerate(raw_tokens)
      ]
      token_column.append(
        ', '.join(t for t in merged if t not in self._TAG_FRAGMENTS))
      filtered_column.append(
        ', '.join(t for t in merged if t not in self._TAG_TOKENS))

    dataframe['tokens'] = token_column
    dataframe['filtered tokens'] = filtered_column
    return dataframe

  def create_pos_dep_lemma(self, dataframe, col):
    """
      Run the spaCy pipeline ``sp`` over column ``col`` and add three columns
      holding the ', '-joined POS tags, dependency labels and lemmas.

      The new columns are named 'pos', 'dep' and 'lem', or '<col>_pos',
      '<col>_dep' and '<col>_lem' when ``col`` is 'e1' or 'e2'. Commas are
      removed from each value before it is parsed.
    """
    pos_column = []
    dep_column = []
    lem_column = []

    for val in dataframe[col]:
      doc = sp(''.join(val).replace(',', ''))
      pos_column.append(', '.join(str(token.pos_) for token in doc))
      dep_column.append(', '.join(str(token.dep_) for token in doc))
      lem_column.append(', '.join(str(token.lemma_) for token in doc))

    prefix = col + '_' if col in ('e1', 'e2') else ''
    dataframe[prefix + 'pos'] = pos_column
    dataframe[prefix + 'dep'] = dep_column
    dataframe[prefix + 'lem'] = lem_column
    return dataframe

  def _entity_text(self, pattern, sentence):
    """Return the first tagged entity matched by ``pattern``, or '' if none."""
    found = pattern.search(sentence)
    return found.group(1) if found else ''

  def _entity_label(self, text):
    """Return the ', '-joined spaCy entity labels for ``text``.

    Falls back to 'NOT RECOGNIZED' when the pipeline ``nlp`` finds no entity.
    """
    labels = [ent.label_ for ent in nlp(text).ents if ent.label_]
    return ', '.join(labels) if labels else self._NO_ENTITY_LABEL

  def create_NER(self, dataframe):
    """
      Extract both tagged entities from the 'line' column.

      Adds four columns: 'e1' and 'e2' hold the text between the <e1>/<e2>
      tags (an empty string when a tag pair is missing), and 'e1_type' and
      'e2_type' hold the entity labels spaCy assigns to that text, or
      'NOT RECOGNIZED' when it assigns none.
    """
    e1_text = []
    e2_text = []
    for sentence in dataframe['line']:
      e1_text.append(self._entity_text(self._E1_PATTERN, sentence))
      e2_text.append(self._entity_text(self._E2_PATTERN, sentence))

    dataframe['e1'] = e1_text
    dataframe['e2'] = e2_text
    # Each entity gets its own label column; the two labels are never mixed
    # into one value.
    dataframe['e1_type'] = [self._entity_label(text) for text in e1_text]
    dataframe['e2_type'] = [self._entity_label(text) for text in e2_text]
    return dataframe

  def _split_words(self, val):
    """Split a ', '-joined token string back into its words."""
    return val.replace(' ', '').split(',')

  def _related_names(self, word, relation):
    """Return the names of the synsets related to the first synset of ``word``.

    ``relation`` is the name of the Synset method to call, e.g. 'hypernyms'.
    Only the part of each synset name before the first '.' is kept. Words
    without any synset give an empty list.
    """
    senses = wn.synsets(word)
    if not senses:
      return []
    return [related.name().split('.')[0]
            for related in getattr(senses[0], relation)()]

  def _add_wordnet_column(self, dataframe, col, colname, relation):
    """Add ``colname`` with one 'word : name, name' entry per word of ``col``.

    Words that have no related synsets are left out of the row's value.
    """
    column = []
    for val in dataframe[col]:
      entries = []
      for word in self._split_words(val):
        names = self._related_names(word, relation)
        if names:
          entries.append(word + ' : ' + ', '.join(names))
      column.append(', '.join(entries))

    dataframe[colname] = column
    return dataframe

  def print_all_hyps(self, dataframe, col):
    """
      Print the part holonyms, part meronyms, hypernyms and hyponyms of the
      first WordNet synset of every word in column ``col`` of ``dataframe``.
      Words without a synset are skipped.
    """
    for val in dataframe[col]:
      for word in self._split_words(val):
        senses = wn.synsets(word)
        if not senses:
          continue

        syn = senses[0]
        print("\n")
        print("Word: ",word)
        print("Holonyms   :", syn.part_holonyms())
        print("Meronyms   :", syn.part_meronyms())
        print("HyperNyms  :", syn.hypernyms())
        print("HypoNyms   :", syn.hyponyms())

  def create_hyper(self, dataframe, col):
    """Add the column 'hyp' with the hypernyms of the words in ``col``."""
    return self._add_wordnet_column(dataframe, col, 'hyp', 'hypernyms')

  def create_holo(self, dataframe, col):
    """Add the column 'holo' with the part holonyms of the words in ``col``."""
    return self._add_wordnet_column(dataframe, col, 'holo', 'part_holonyms')

  def create_mero(self, dataframe, col):
    """Add the column 'mero' with the part meronyms of the words in ``col``."""
    return self._add_wordnet_column(dataframe, col, 'mero', 'part_meronyms')

  def create_hypo(self, dataframe, col):
    """Add the column 'hypo' with the hyponyms of the words in ``col``."""
    return self._add_wordnet_column(dataframe, col, 'hypo', 'hyponyms')

# Run the Task 2 feature builders one after another on the parsed corpus.
task2 = AllTasks()
ans_df = task2.create_tokens(ans_df)
for index in ans_df.index:
  print("The Tokens are          : ", ans_df.loc[index, 'tokens'])
  print("The Filtered Tokens are : ", ans_df.loc[index, 'filtered tokens'])
  print('\n')

ans_df = task2.create_pos_dep_lemma(ans_df, 'filtered tokens')
for index in ans_df.index:
  print("The Filtered Tokens are     : ", ans_df.loc[index, 'filtered tokens'], "\n")
  print("The Lemmas are              : ", ans_df.loc[index, 'lem'], "\n")
  print("The POS Tags are            : ", ans_df.loc[index, 'pos'], "\n")
  print("The Dependency Parse is     : ", ans_df.loc[index, 'dep'], "\n")
  print('\n')

ans_df = task2.create_NER(ans_df)

# WordNet relation columns, added in the same order as before.
for add_relation in (task2.create_hyper, task2.create_mero,
                     task2.create_holo, task2.create_hypo):
  ans_df = add_relation(ans_df, 'filtered tokens')

# Bare expression kept from the notebook, where it displays the frame.
ans_df