
Commit

add pointer
czyssrs committed Jan 26, 2019
1 parent ed1f2e0 commit 11f96a9
Showing 16 changed files with 320 additions and 87 deletions.
Binary file added AttentionUnit.pyc
Binary file not shown.
Binary file added DataLoader.pyc
Binary file not shown.
Binary file added LstmUnit.pyc
Binary file not shown.
37 changes: 21 additions & 16 deletions Main.py
@@ -32,7 +32,7 @@

tf.app.flags.DEFINE_string("mode",'train','train or test')
tf.app.flags.DEFINE_string("load",'0','load directory') # BBBBBESTOFAll
tf.app.flags.DEFINE_string("dir",'processed_data','data set directory')
tf.app.flags.DEFINE_string("dir",'/scratch/home/zhiyu/wiki2bio/processed_data','data set directory')
tf.app.flags.DEFINE_integer("limits", 0,'max data set size')


@@ -51,14 +51,20 @@

model_dir = sys.argv[1]

gold_path_test = 'processed_data/test/test_split_for_rouge/gold_summary_'
gold_path_valid = 'processed_data/valid/valid_split_for_rouge/gold_summary_'
### paths for calculating ROUGE
# gold_path_test = 'processed_data/test/test_split_for_rouge/gold_summary_'
# gold_path_valid = 'processed_data/valid/valid_split_for_rouge/gold_summary_'

###
root_path = "/scratch/home/zhiyu/wiki2bio/"
gold_path_valid = root_path + 'original_data/valid.summary'
gold_path_test = root_path + 'original_data/test.summary'

# test phase
if FLAGS.load != "0":
save_dir = 'results/res/' + FLAGS.load + '/'
save_file_dir = save_dir + 'files/'
pred_dir = 'results/evaluation/' + FLAGS.load + '/'
save_dir = root_path + 'results/res/' + model_dir + '/loads/' + FLAGS.load + '/'
save_file_dir = root_path + 'results/res/' + model_dir + '/' + 'files/'
pred_dir = root_path + 'results/evaluation/' + model_dir + '/' + FLAGS.load + '/'
if not os.path.exists(pred_dir):
os.mkdir(pred_dir)
if not os.path.exists(save_file_dir):
@@ -67,13 +73,12 @@
pred_beam_path = pred_dir + 'beam_summary_'
# train phase
else:
prefix = str(int(time.time() * 1000))
os.mkdir('results/res/' + model_dir)
os.mkdir('results/evaluation/' + model_dir)
save_dir = 'results/res/' + model_dir + '/' + prefix + '/'
# prefix = str(int(time.time() * 1000))
os.mkdir(root_path + 'results/res/' + model_dir)
os.mkdir(root_path + 'results/evaluation/' + model_dir)
save_dir = root_path + 'results/res/' + model_dir + '/'
save_file_dir = save_dir + 'files/'
pred_dir = 'results/evaluation/' + model_dir + '/' + prefix + '/'
os.mkdir(save_dir)
pred_dir = root_path + 'results/evaluation/' + model_dir + '/'
if not os.path.exists(pred_dir):
os.mkdir(pred_dir)
if not os.path.exists(save_file_dir):
@@ -101,8 +106,6 @@ def train(sess, dataloader, model):
record_loss += this_loss
k += 1
record_k += 1
ksave_dir = save_model(model, save_dir, k // FLAGS.report)
write_log(evaluate(sess, dataloader, model, ksave_dir, 'test'))
progress_bar(k%FLAGS.report, FLAGS.report)
### czy
if (record_k % FLAGS.report_loss == 0):
@@ -111,6 +114,7 @@ def train(sess, dataloader, model):
record_loss = 0.0

if (k % FLAGS.report == 0):
print "Round: ", k / FLAGS.report
cost_time = time.time() - start_time
write_log("%d : loss = %.3f, time = %.3f " % (k // FLAGS.report, loss, cost_time))
loss, start_time = 0.0, time.time()
@@ -136,12 +140,12 @@ def save_model(model, save_dir, cnt):
def evaluate(sess, dataloader, model, ksave_dir, mode='valid'):
if mode == 'valid':
# texts_path = "original_data/valid.summary"
texts_path = "processed_data/valid/valid.box.val"
texts_path = root_path + "processed_data/valid/valid.box.val"
gold_path = gold_path_valid
evalset = dataloader.dev_set
else:
# texts_path = "original_data/test.summary"
texts_path = "processed_data/test/test.box.val"
texts_path = root_path + "processed_data/test/test.box.val"
gold_path = gold_path_test
evalset = dataloader.test_set

@@ -187,6 +191,7 @@ def evaluate(sess, dataloader, model, ksave_dir, mode='valid'):


### new bleu
print ksave_dir + mode + "_summary_unk.txt"
bleu_unk = bleu_score(gold_path, ksave_dir + mode + "_summary_unk.txt")
nocopy_result = "without copy BLEU: %.4f\n"%bleu_unk
bleu_copy = bleu_score(gold_path, ksave_dir + mode + "_summary_copy.clean.txt")
Binary file added OutputUnit.pyc
Binary file not shown.
Binary file added PythonROUGE.pyc
Binary file not shown.
37 changes: 27 additions & 10 deletions SeqUnit.py
@@ -140,7 +140,7 @@ def __init__(self, batch_size, hidden_size, emb_size, field_size, pos_size, sour
### original loss with logits
#losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=de_outputs, labels=self.decoder_output)

losses = -tf.reduce_sum(self.decoder_output_one_hot * tf.log(de_outputs), 2)
losses = -tf.reduce_sum(self.decoder_output_one_hot * tf.log(de_outputs + 1e-9), 2)


mask = tf.sign(tf.to_float(self.decoder_output))
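
Because de_outputs is now the pointer-mixture probability distribution rather than logits, the old sparse_softmax_cross_entropy_with_logits call no longer applies; the explicit negative log-likelihood with a 1e-9 offset avoids log(0) when the target token receives zero mass. A minimal sketch of this masked loss, assuming PAD has id 0 and that decoder_output_one_hot is simply a one-hot encoding of the targets:

import tensorflow as tf

def masked_nll(de_outputs, decoder_output, target_vocab):
    # de_outputs: [batch, dec_len, vocab] probabilities (not logits)
    # decoder_output: [batch, dec_len] int token ids, 0 = PAD (assumption)
    one_hot = tf.one_hot(decoder_output, target_vocab)
    losses = -tf.reduce_sum(one_hot * tf.log(de_outputs + 1e-9), 2)  # 1e-9 guards log(0)
    mask = tf.sign(tf.to_float(decoder_output))                      # zero out PAD positions
    return tf.reduce_mean(tf.reduce_sum(losses * mask, 1))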
@@ -240,15 +240,14 @@ def loop_fn(t, x_t, s_t, emit_ta, finished):
### pointer generator
#emit_ta = emit_ta.write(t, o_t)

### o_weight = len * batch * 1, already normalized. p_gen = batch * 1
### o_weight = batch * len, already normalized. p_gen = batch * 1
out_dist = p_gen * tf.nn.softmax(o_t) # batch * self.target_vocab
att_dist = tf.squeeze(o_weight) # len * batch
att_dist = (1 - p_gen) * tf.transpose(att_dist, [1,0]) # batch * len
att_dist = (1 - p_gen) * o_weight # batch * len

batch_nums = tf.range(0, limit=batch_size) # shape (batch_size)
batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1)
batch_nums = tf.tile(batch_nums, [1, encoder_len]) # shape (batch_size, attn_len)
indices = tf.stack( (batch_nums, self.encoder_input), axis=2) # shape (batch_size, enc_t, 2)
batch_nums = tf.tile(batch_nums, [1, encoder_len]) # shape (batch_size, enc_len)
indices = tf.stack((batch_nums, self.encoder_input), axis=2) # shape (batch_size, enc_len, 2)
shape = [batch_size, self.target_vocab]
attn_dists_projected = tf.scatter_nd(indices, att_dist, shape)
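
At each decoding step the generator softmax (scaled by p_gen) and the attention distribution over source positions (scaled by 1 - p_gen) are mixed by scattering the attention weights onto the vocabulary ids of the source tokens; tf.scatter_nd sums the mass of repeated ids. A self-contained toy sketch of this mixture (sizes and inputs are illustrative, not the repo's graph):

import tensorflow as tf

batch_size, encoder_len, target_vocab = 2, 3, 10
o_t = tf.random_normal([batch_size, target_vocab])                      # generator scores
o_weight = tf.nn.softmax(tf.random_normal([batch_size, encoder_len]))   # attention, batch * len
p_gen = tf.sigmoid(tf.random_normal([batch_size, 1]))                   # copy/generate gate
encoder_input = tf.constant([[4, 7, 4], [1, 2, 3]])                     # source ids, batch * len

out_dist = p_gen * tf.nn.softmax(o_t)                                   # batch * vocab
att_dist = (1 - p_gen) * o_weight                                       # batch * len
batch_nums = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, encoder_len])
indices = tf.stack((batch_nums, encoder_input), axis=2)                 # batch * len * 2
attn_dists_projected = tf.scatter_nd(indices, att_dist, [batch_size, target_vocab])
final_dists = out_dist + attn_dists_projected                           # duplicate source ids accumulate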

@@ -282,11 +281,29 @@ def decoder_g(self, initial_state):

def loop_fn(t, x_t, s_t, emit_ta, att_ta, finished):
o_t, s_nt = self.dec_lstm(x_t, s_t, finished)
o_t, w_t = self.att_layer(o_t)
o_t, o_weight, p_gen = self.att_layer(o_t, x_t, s_t)
o_t = self.dec_out(o_t, finished)
emit_ta = emit_ta.write(t, o_t)
att_ta = att_ta.write(t, w_t)
next_token = tf.arg_max(o_t, 1)

### pointer generator
#emit_ta = emit_ta.write(t, o_t)

### o_weight = batch * len, already normalized. p_gen = batch * 1
out_dist = p_gen * tf.nn.softmax(o_t) # batch * self.target_vocab
att_dist = (1 - p_gen) * o_weight # batch * len

batch_nums = tf.range(0, limit=batch_size) # shape (batch_size)
batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1)
batch_nums = tf.tile(batch_nums, [1, encoder_len]) # shape (batch_size, enc_len)
indices = tf.stack((batch_nums, self.encoder_input), axis=2) # shape (batch_size, enc_len, 2)
shape = [batch_size, self.target_vocab]
attn_dists_projected = tf.scatter_nd(indices, att_dist, shape) # batch * target_vocab

final_dists = out_dist + attn_dists_projected


emit_ta = emit_ta.write(t, final_dists)
att_ta = att_ta.write(t, tf.transpose(o_weight, [1,0]))
next_token = tf.arg_max(final_dists, 1)
x_nt = tf.nn.embedding_lookup(self.embedding, next_token)
finished = tf.logical_or(finished, tf.equal(next_token, self.stop_token))
finished = tf.logical_or(finished, tf.greater_equal(t, self.max_length))
Binary file added SeqUnit.pyc
Binary file not shown.
20 changes: 15 additions & 5 deletions dualAttentionUnit.py
@@ -9,6 +9,7 @@

class dualAttentionWrapper(object):
def __init__(self, emb_size, hidden_size, input_size, field_size, hs, fds, scope_name):
### here input_size == hidden_size
self.hs = tf.transpose(hs, [1,0,2]) # input_len * batch * input_size
self.fds = tf.transpose(fds, [1,0,2])
self.hidden_size = hidden_size
@@ -33,14 +34,16 @@ def __init__(self, emb_size, hidden_size, input_size, field_size, hs, fds, scope
### add pointer params
### p_gen = sigmoid(wh * ht + ws * st + wx * xt + bptr)
self.wh_ptr = tf.get_variable('wh_ptr', [self.hidden_size, 1])
self.ws_ptr = tf.get_variable('ws_ptr', [self.hidden_size, 1])
self.ws_ptr = tf.get_variable('ws_ptr', [2*self.hidden_size, 1])
self.wx_ptr = tf.get_variable('wx_ptr', [self.emb_size, 1])
self.b_ptr = tf.get_variable('b_ptr', [1])

self.params.update({'Wh': self.Wh, 'Ws': self.Ws, 'Wo': self.Wo,
'bh': self.bh, 'bs': self.bs, 'bo': self.bo,
'Wf': self.Wf, 'Wr': self.Wr,
'bf': self.bf, 'br': self.br})
'bf': self.bf, 'br': self.br,
'wh_ptr': self.wh_ptr, 'ws_ptr': self.ws_ptr,
'wx_ptr': self.wx_ptr, 'b_ptr': self.b_ptr})

hs2d = tf.reshape(self.hs, [-1, input_size])
phi_hs2d = tf.tanh(tf.nn.xw_plus_b(hs2d, self.Wh, self.bh))
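
ws_ptr is widened to [2*hidden_size, 1] because the decoder state handed to the attention layer is the LSTM (h, c) tuple, which __call__ concatenates before projecting; the gate then follows the comment p_gen = sigmoid(wh * ht + ws * st + wx * xt + bptr). A hedged sketch of that gate in isolation (the helper name and scoping are illustrative):

import tensorflow as tf

def pointer_gate(context, s_t, x_t, hidden_size, emb_size):
    # context: [batch, hidden_size] attention context
    # s_t: (h, c) LSTM state tuple, each [batch, hidden_size]
    # x_t: [batch, emb_size] current input embedding
    wh_ptr = tf.get_variable('wh_ptr', [hidden_size, 1])
    ws_ptr = tf.get_variable('ws_ptr', [2 * hidden_size, 1])   # state is concat(h, c)
    wx_ptr = tf.get_variable('wx_ptr', [emb_size, 1])
    b_ptr = tf.get_variable('b_ptr', [1])
    h_prev, c_prev = s_t
    state = tf.concat([h_prev, c_prev], 1)                     # [batch, 2*hidden_size]
    return tf.sigmoid(tf.matmul(context, wh_ptr) + tf.matmul(state, ws_ptr)
                      + tf.matmul(x_t, wx_ptr) + b_ptr)        # [batch, 1]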
@@ -52,12 +55,12 @@
def __call__(self, x, in_t, s_t, coverage = None, finished = None):
gamma_h = tf.tanh(tf.nn.xw_plus_b(x, self.Ws, self.bs)) # batch * hidden_size
alpha_h = tf.tanh(tf.nn.xw_plus_b(x, self.Wr, self.br))
fd_weights = tf.reduce_sum(self.phi_fds * alpha_h, reduction_indices=2, keep_dims=True)
fd_weights = tf.reduce_sum(self.phi_fds * alpha_h, reduction_indices=2, keep_dims=True) # len * batch * 1
fd_weights = tf.exp(fd_weights - tf.reduce_max(fd_weights, reduction_indices=0, keep_dims=True))
fd_weights = tf.divide(fd_weights, (1e-6 + tf.reduce_sum(fd_weights, reduction_indices=0, keep_dims=True))) # len * batch * 1


weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True) # input_len * batch
weights = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2, keep_dims=True) # input_len * batch * 1
weights = tf.exp(weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
weights = tf.divide(weights, (1e-6 + tf.reduce_sum(weights, reduction_indices=0, keep_dims=True)))
weights = tf.divide(weights * fd_weights, (1e-6 + tf.reduce_sum(weights * fd_weights, reduction_indices=0, keep_dims=True))) # len * batch * 1
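
Both attention channels, over encoder hidden states and over field embeddings, get a numerically stable softmax along the input-length axis, and their product is renormalized to give the final len * batch * 1 weights. A NumPy sketch of the same normalization, assuming the raw scores are already computed:

import numpy as np

def dual_attention_weights(word_scores, field_scores):
    # word_scores, field_scores: [input_len, batch, 1] unnormalized scores
    def softmax_over_len(s):
        s = np.exp(s - s.max(axis=0, keepdims=True))                # subtract max for stability
        return s / (1e-6 + s.sum(axis=0, keepdims=True))
    w = softmax_over_len(word_scores)                                # word-level attention
    f = softmax_over_len(field_scores)                               # field-level attention
    combined = w * f
    return combined / (1e-6 + combined.sum(axis=0, keepdims=True))   # len * batch * 1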
@@ -67,11 +70,18 @@ def __call__(self, x, in_t, s_t, coverage = None, finished = None):

#### pointer generator
### p_gen = sigmoid(wh * ht + ws * st + wx * xt + bptr)
p_gen = tf.matmul(context, self.wh_ptr) + tf.matmul(s_t, self.ws_ptr) + tf.matmul(x_t, self.self.wx_ptr) + self.b_ptr
h_prev, c_prev = s_t
s_t = tf.concat([h_prev, c_prev], 1)
p_gen = tf.matmul(context, self.wh_ptr) + tf.matmul(s_t, self.ws_ptr) + tf.matmul(in_t, self.wx_ptr) + self.b_ptr
p_gen = tf.sigmoid(p_gen) # batch * 1
weights = tf.squeeze(weights) # len * batch
weights = tf.transpose(weights, [1,0]) # batch * len

if finished is not None:
out = tf.where(finished, tf.zeros_like(out), out)
p_gen = tf.where(finished, tf.zeros_like(p_gen), p_gen)
weights = tf.where(finished, tf.zeros_like(weights), weights)

return out, weights, p_gen

def save(self, path):
Binary file added dualAttentionUnit.pyc
Binary file not shown.
Binary file added fgateLstmUnit.pyc
Binary file not shown.
177 changes: 177 additions & 0 deletions multi-bleu.perl
@@ -0,0 +1,177 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
use warnings;
use strict;

my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}

$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

my @REF;
my $ref=0;
while(-e "$stem$ref") {
&add_to_ref("$stem$ref",\@REF);
$ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
&add_to_ref($stem,\@REF) if -e $stem;
}



sub add_to_ref {
my ($file,$REF) = @_;
my $s=0;
if ($file =~ /.gz$/) {
open(REF,"gzip -dc $file|") or die "Can't read $file";
} else {
open(REF,$file) or die "Can't read $file";
}
while(<REF>) {
chop;
push @{$$REF[$s++]}, $_;
}
close(REF);
}

my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(' ',$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
if ($diff < $closest_diff) {
$closest_diff = $diff;
$closest_length = $length;
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
} elsif ($diff == $closest_diff) {
$closest_length = $length if $length < $closest_length;
# from two references with the same closeness to me
# take the *shorter* into account, not the "first" one.
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
if (!defined($REF_NGRAM{$ngram}) ||
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
}
}
}
}
$length_translation += $length_translation_this_sentence;
$length_reference += $closest_length;
for(my $n=1;$n<=4;$n++) {
my %T_NGRAM = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$T_NGRAM{$ngram}++;
}
foreach my $ngram (keys %T_NGRAM) {
$ngram =~ /^(\d+) /;
my $n = $1;
# my $corr = 0;
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
$TOTAL[$n] += $T_NGRAM{$ngram};
if (defined($REF_NGRAM{$ngram})) {
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
$CORRECT[$n] += $T_NGRAM{$ngram};
# $corr = $T_NGRAM{$ngram};
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
}
else {
$CORRECT[$n] += $REF_NGRAM{$ngram};
# $corr = $REF_NGRAM{$ngram};
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
}
}
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
}
}
$s++;
}
my $brevity_penalty = 1;
my $bleu = 0;

my @bleu=();

for(my $n=1;$n<=4;$n++) {
if (defined ($TOTAL[$n])){
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
}else{
$bleu[$n]=0;
}
}

if ($length_reference==0){
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}

if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
my_log( $bleu[3] ) +
my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.4f, %.4f/%.4f/%.4f/%.4f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
100*$bleu,
100*$bleu[1],
100*$bleu[2],
100*$bleu[3],
100*$bleu[4],
$brevity_penalty,
$length_translation / $length_reference,
$length_translation,
$length_reference;


print STDERR "It is in-advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n";

sub my_log {
return -9999999999 unless $_[0];
return log($_[0]);
}
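
multi-bleu.perl reads the hypothesis from stdin and one or more reference files named on the command line, reporting corpus BLEU as BP * exp(mean of the log 1-4-gram precisions). The bleu_score calls in Main.py presumably wrap an invocation like the one below; since that wrapper is not part of this diff, the snippet is only an assumed illustration:

import subprocess

def multi_bleu(reference_path, hypothesis_path, script='multi-bleu.perl'):
    # BLEU = BP * exp((log p1 + log p2 + log p3 + log p4) / 4), as computed by the script
    with open(hypothesis_path) as hyp:
        out = subprocess.check_output(['perl', script, reference_path], stdin=hyp)
    # Example output (illustrative numbers): "BLEU = 28.1400, 62.1000/33.9000/20.3000/12.7000 (BP=1.000, ...)"
    return float(out.decode().split()[2].strip(','))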