Update README and fix preprocess
qibinc committed Dec 22, 2021
1 parent 5416d2f commit 28b7f1f
Showing 4 changed files with 43 additions and 31 deletions.
60 changes: 35 additions & 25 deletions README.md
@@ -48,14 +48,33 @@ Verify that KOBE is correctly installed by `import kobe`.
### Dataset

We use the **TaoDescribe** dataset, which contains 2,129,187 product titles and descriptions in Chinese.
<!-- - (optional) You can download the un-preprocessed dataset from [here](https://www.dropbox.com/sh/nnnq9eobmn6u44v/AAA7s4YkVbslS-6slDIOn4MYa) or [here (for users in China)](https://tianchi.aliyun.com/dataset/dataDetail?dataId=9717). -->

Run the following command to download the dataset:
Run the following command to automatically download the dataset:

```bash
python -m kobe.data.download
```

The downloaded files will be placed at `saved/raw/`:

```
18G KOBE/saved
1.6G ├──raw
42K │ ├──test.cond
1.4M │ ├──test.desc
2.0M │ ├──test.fact
450K │ ├──test.title
17M │ ├──train.cond
553M │ ├──train.desc
794M │ ├──train.fact
183M │ ├──train.title
80K │ ├──valid.cond
2.6M │ ├──valid.desc
3.7M │ ├──valid.fact
853K │ └──valid.title
...
```

<details>
<summary>
Meanings of downloaded data files
@@ -68,9 +87,6 @@ Meanings of downloaded data files
</ul>
</details>
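
If you want a quick sanity check of the raw data, a few lines of Python are enough to peek at the first training example. This is only a sketch: it assumes each `saved/raw/train.*` file stores one example per line, with the four files aligned line by line, which is not spelled out in this README.

```python
# Sketch: print the beginning of the first training example from each raw file.
# Assumption: one example per line, files aligned across the four fields.
from pathlib import Path

raw = Path("saved/raw")
for field in ["title", "cond", "fact", "desc"]:
    with open(raw / f"train.{field}", encoding="utf-8") as f:
        first = f.readline().strip()
        print(f"train.{field}: {first[:80]}")
```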

<!-- - First, download the preprocessed TaoDescribe dataset by running `python scripts/download_preprocessed_tao.py`.
- If you're in regions where Dropbox is blocked (e.g. Mainland China), try `python scripts/download_preprocessed_tao.py --cn`. -->

## Preprocessing

Preprocessing is a commonly neglected part of code releases. However, we now provide the preprocessing scripts to rebuild the vocabulary and tokenize the texts, in case you wish to preprocess the KOBE data yourself or need to run on your own data.
@@ -88,7 +104,7 @@ python -m kobe.data.vocab \

### Tokenization

Then, we tokenize the raw inputs and save the preprocessed samples to `.tar` files.
Then, we will tokenize the raw inputs and save the preprocessed samples to `.tar` files. Note: this process can take a while (about 20 minutes with an 8-core processor).

```bash
python -m kobe.data.preprocess \
@@ -99,9 +115,9 @@ python -m kobe.data.preprocess \
--cond-vocab-file saved/vocab.cond.model
```

You can peek into the `saved/raw/` and `saved/processed/` directories to see what these preprocessing scripts did!
You can peek into the `saved/` directory to see what these preprocessing scripts did:

```bash
```
18G KOBE/saved
16G ├──processed
20M │ ├──test.tar
@@ -113,21 +129,9 @@ You can peek into the `saved/raw/` and `saved/processed/` directories to see what these preprocessing scripts did
1.0G │ ├──train-5.tar
1.0G │ ├──train-6.tar
1.0G │ ├──train-7.tar
8.1G │ ├──train.tar
38M │ └──valid.tar
1.6G ├──raw
42K │ ├──test.cond
1.4M │ ├──test.desc
2.0M │ ├──test.fact
450K │ ├──test.title
17M │ ├──train.cond
553M │ ├──train.desc
794M │ ├──train.fact
183M │ ├──train.title
80K │ ├──valid.cond
2.6M │ ├──valid.desc
3.7M │ ├──valid.fact
853K │ └──valid.title
│ ├──...
238K └──vocab.cond.model
```
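
If you are curious what the tokenization step actually wrote, you can list a few members of one shard. The sketch below only assumes the processed `.tar` files are ordinary tar archives; their internal layout is not documented here.

```python
# Sketch: show the first few entries of a processed training shard.
# Assumption: saved/processed/train-0.tar is a standard tar archive.
import tarfile

with tarfile.open("saved/processed/train-0.tar") as tar:
    for _ in range(5):
        member = tar.next()  # returns None when the archive is exhausted
        if member is None:
            break
        print(member.name, member.size)
```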

@@ -152,13 +156,19 @@ python -m kobe.train --mode kobe-know --name kobe-know
python -m kobe.train --mode kobe-full --name kobe-full
```

After launching any of the experiments above, please go to the WandB link printed out in the terminal to view the training/validation loss, BLEU, and even the generated examples (updated once every epoch)!
After launching any of the experiments above, please go to the WandB link printed in the terminal to view the training progress and evaluation results (updated at the end of every epoch, roughly once every 2 hours).

If you would like to change other hyperparameters, please look at `kobe/utils/options.py`.
If you would like to change other hyperparameters, please look at `kobe/utils/options.py`. For example, the default setting trains the models for 30 epochs with a batch size of 64, which is around 1 million steps. You can add options like `--epochs 100` to train for more epochs and obtain better results. You can also increase `--num-encoder-layers` and `--num-decoder-layers` if better GPUs are available.
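
For instance, a longer run with a deeper encoder and decoder could be launched as follows. The flag names come from `kobe/utils/options.py`; the specific values are only illustrative, not tuned recommendations.

```bash
# Illustrative hyperparameter overrides; values are examples, not recommendations.
python -m kobe.train \
  --mode kobe-full \
  --name kobe-full-deep \
  --epochs 100 \
  --num-encoder-layers 8 \
  --num-decoder-layers 8 \
  --batch-size 64
```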

### Testing KOBE
### Evaluating KOBE

Evaluation is now super convenient and reproducible with the help of pytorch-lightning and WandB. The checkpoint with the best BLEU score will be saved at `kobe-v2/<wandb-run-id>/checkpoints/<best_epoch-best_step>.ckpt`. To evaluate this model, run the following command:

```bash
python -m kobe.train --mode baseline --name test-baseline --test --load-file kobe-v2/<wandb-run-id>/checkpoints/<best_epoch-best_step>.ckpt
```

TODO
The results will be displayed on the WandB dashboard at the link printed in the terminal. The evaluation metrics we provide include BLEU, a diversity score, and [BERTScore](https://arxiv.org/abs/1904.09675). You can also manually inspect some generated examples and their references under the `examples/` section on WandB.
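
If you want to sanity-check BLEU locally, outside of WandB, a library such as sacrebleu can score a list of generated descriptions against references. This is not part of the repository's pipeline; the file names below are placeholders.

```python
# Sketch: local BLEU check with sacrebleu (not part of the KOBE pipeline).
# "hyps.txt" and "refs.txt" are placeholder files, one description per line.
import sacrebleu

with open("hyps.txt", encoding="utf-8") as f:
    hyps = [line.strip() for line in f]
with open("refs.txt", encoding="utf-8") as f:
    refs = [line.strip() for line in f]

# tokenize="zh" selects sacrebleu's Chinese tokenizer, matching the dataset language.
bleu = sacrebleu.corpus_bleu(hyps, [refs], tokenize="zh")
print(bleu.score)
```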

## Cite

2 changes: 1 addition & 1 deletion kobe/data/preprocess.py
@@ -97,7 +97,7 @@ def write_to_tar(fname, examples):
if len(examples) > 10000:
    # save to shards for training data
    shard_size = (len(examples) + 7) // 8
    for shard_id in range(0, len(examples), shard_size):
    for shard_id in range(8):
        write_to_tar(
            f"{output}-{shard_id}.tar",
            examples[shard_id * shard_size : (shard_id + 1) * shard_size],
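The one-line fix above switches the loop from iterating over example offsets to iterating over the eight shard indices. A standalone toy example (not the project's code) shows why eight shards of size ⌈n/8⌉ cover every example exactly once:

```python
# Toy illustration of the sharding arithmetic used in the fixed loop above.
examples = list(range(20))                # pretend there are 20 examples
shard_size = (len(examples) + 7) // 8     # ceil(20 / 8) == 3
shards = [examples[i * shard_size:(i + 1) * shard_size] for i in range(8)]

assert sum(len(s) for s in shards) == len(examples)
print([len(s) for s in shards])           # [3, 3, 3, 3, 3, 3, 2, 0] -- trailing shards may be short or empty
```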
6 changes: 4 additions & 2 deletions kobe/data/vocab.py
@@ -4,6 +4,7 @@
import sentencepiece as spm
from transformers.models.bert.tokenization_bert import BertTokenizer

# Load the text tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

BOS_TOKEN = tokenizer.cls_token
@@ -14,12 +15,13 @@
EOS_ID = tokenizer.sep_token_id
UNK_ID = tokenizer.unk_token_id

# Build the condition (attribute) tokenizer
if __name__ == "__main__":
parser = ArgumentParser()
# fmt: off
parser.add_argument("--input", nargs="+", required=True)
parser.add_argument("--vocab-file", type=str, required=True)
parser.add_argument("--vocab-size", type=int, default=5000)
parser.add_argument("--vocab-size", type=int, default=31)
parser.add_argument("--algo", type=str, default="bpe", choices=["bpe", "word"])
# fmt: on
args = parser.parse_args()
@@ -31,7 +33,7 @@
f.write(input_f.read() + "\n")
# run sentence piece with bpe
spm.SentencePieceTrainer.Train(
f"--add_dummy_prefix=false --pad_id={PAD_ID} --bos_id={BOS_ID} --eos_id={EOS_ID} --unk_id={UNK_ID} "
f"--add_dummy_prefix=false --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3 "
f"--vocab_size={args.vocab_size} "
f"--model_prefix={args.vocab_file} --model_type={args.algo} "
f"--input={f.name}"
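As a usage note (this is not code from the repository), the trained condition vocabulary written to `saved/vocab.cond.model` can be loaded back with SentencePiece and used to encode attribute strings; the sample text below is a placeholder, since the actual contents of the `.cond` files are not shown in this diff.

```python
# Sketch: load the trained condition vocabulary and encode a placeholder string.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("saved/vocab.cond.model")

ids = sp.EncodeAsIds("example attribute string")  # placeholder input, not real .cond data
print(ids)
print(sp.DecodeIds(ids))
```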
6 changes: 3 additions & 3 deletions kobe/utils/options.py
@@ -18,7 +18,7 @@ def add_options(parser: ArgumentParser):
parser.add_argument("--d-model", default=512, type=int)
parser.add_argument("--nhead", default=8, type=int)
parser.add_argument("--num-encoder-layers", default=6, type=int)
parser.add_argument("--num-decoder-layers", default=2, type=int)
parser.add_argument("--num-decoder-layers", default=6, type=int)
parser.add_argument("--max-seq-len", default=256, type=int)
parser.add_argument("--mode", default="baseline", type=str, choices=[
helpers.BASELINE, helpers.KOBE_ATTRIBUTE, helpers.KOBE_KNOWLEDGE, helpers.KOBE_FULL])
@@ -27,8 +27,8 @@
parser.add_argument("--name", default="exp", type=str, help="expeirment name")
parser.add_argument("--gpu", default=1, type=int)
parser.add_argument("--grad-clip", default=1.0, type=float, help="clip threshold of gradients")
parser.add_argument("--epochs", default=300, type=int, help="number of epochs to train")
parser.add_argument("--patience", default=100, type=int, help="early stopping patience")
parser.add_argument("--epochs", default=30, type=int, help="number of epochs to train")
parser.add_argument("--patience", default=10, type=int, help="early stopping patience")
parser.add_argument("--lr", default=1, type=float, help="learning rate")
parser.add_argument("--dropout", default=0.1, type=float, help="dropout rate")
parser.add_argument("--batch-size", default=64, type=int)
