update scripts

Spico197 · Jun 19, 2023 · 08acb19 · 08acb19
1 parent a0c2b24
commit 08acb19
Show file tree

Hide file tree

Showing 24 changed files with 661 additions and 55 deletions.
diff --git a/README.md b/README.md
diff --git a/conf/Pretrain_excluded.yaml b/conf/Pretrain_excluded.yaml
@@ -1,6 +1,6 @@
 # task
 task_type: SchemaGuidedInstructBertTask
-task_name: Mirror_Pretrain_AllExcluded
+task_name: Mirror_Pretrain_AllExcluded_2
 comment: '~~content as label, (start, end + 1) span'
 
 # data preprocessing

diff --git a/conf/Pretrain_v1.5.yaml b/conf/Pretrain_v1.5.yaml
@@ -1,6 +1,6 @@
 # task
 task_type: SchemaGuidedInstructBertTask
-task_name: Mirror_Pretrain_DataV1.5
+task_name: Mirror_Pretrain_DataV1.5_2
 comment: '~~content as label, (start, end + 1) span'
 
 # data preprocessing

diff --git a/conf/Pretrain_v1.5_woInstruction.yaml b/conf/Pretrain_v1.5_woInstruction.yaml
@@ -0,0 +1,51 @@
+# task
+task_type: SchemaGuidedInstructBertTask
+task_name: Mirror_Pretrain_DataV1.5_woInstruction
+comment: '~~content as label, (start, end + 1) span'
+
+# data preprocessing
+max_seq_len: 512
+debug_mode: false
+label_span: tag  # tag `[LM]` or content `person`
+mode: span  # w2 (1,2,3) or span (1,3)
+stream_mode: false
+
+# filepaths
+plm_dir: /data/tzhu/PLM/microsoft--deberta-v3-large
+data_dir: resources/Mirror/v1.5/merged/t-rex-200k-woInstruction/remove_instruction
+output_dir: mirror_outputs
+task_dir: ${output_dir}/${task_name}
+train_filepath: ${data_dir}/train.jsonl
+dev_filepath: ${data_dir}/dev.jsonl
+test_filepath: ${data_dir}/test.jsonl
+dump_cache_dir: ${task_dir}/cache
+regenerate_cache: false
+
+# training
+random_seed: 1227
+base_model_path: null
+eval_on_data: [train]
+select_best_on_data: train
+select_best_by_key: loss
+final_eval_on_test: false
+save_every_ckpt: true
+save_best_ckpt: true
+
+warmup_proportion: 0.1
+num_epochs: 3
+epoch_patience: -1
+num_steps: -1
+step_patience: -1
+step_eval_interval: 10000
+train_batch_size: 8
+eval_batch_size: 8
+grad_accum_steps: 1
+learning_rate: !!float 2e-5
+other_learning_rate: !!float 1e-4
+max_grad_norm: 1.0
+weight_decay: 0.1
+
+# model
+dropout: 0.3
+use_rope: true
+biaffine_size: 512
diff --git a/conf/uie_data/fewshot.yaml b/conf/uie_data/fewshot.yaml
@@ -1,4 +1,5 @@
 num_epochs: 200
 epoch_patience: 10
 output_dir: mirror_fewshot_outputs
-base_model_path: mirror_outputs/MirrorLarge_SamplingPretrain_woLowResource_woOverlap/ckpt/SchemaGuidedInstructBertModel.best.pth
+base_model_path: mirror_outputs/Mirror_Pretrain_AllExcluded_2/ckpt/SchemaGuidedInstructBertModel.best.pth
+save_every_ckpt: false
diff --git a/conf/uie_data/wPretrain.yaml b/conf/uie_data/wPretrain.yaml
@@ -1,5 +1,5 @@
 plm_dir: /data/tzhu/PLM/microsoft--deberta-v3-large
-base_model_path: mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap/ckpt/SchemaGuidedInstructBertModel.best.pth
+base_model_path: mirror_outputs/Mirror_Pretrain_AllExcluded_2/ckpt/SchemaGuidedInstructBertModel.best.pth
 
 stream_mode: false
 train_filepath: ${data_dir}/train.jsonl
@@ -8,6 +8,9 @@ test_filepath: ${data_dir}/test.jsonl
 
 num_epochs: 20
 epoch_patience: 3
+num_steps: -1
+step_patience: -1
+step_eval_interval: -1
 
 eval_on_data: [dev]
 select_best_on_data: dev

diff --git a/eval.py b/eval.py
@@ -22,8 +22,11 @@
 # task_dir = "mirror_outputs/Mirror_UIE_wPT_woOverlapV2"
 # task_dir = "mirror_outputs/Mirror_ExcludedPretrain_MultiTask"
 # task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woZeroShotNER"
-task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap"
+# task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap"
 # task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woLowResource_woOverlap"
+# task_dir = "mirror_outputs/Mirror_Pretrain_DataV1.5_2"
+# task_dir = "mirror_outputs/Mirror_Pretrain_AllExcluded_2"
+task_dir = "mirror_outputs/Mirror_Pretrain_DataV1.5_woInstruction"
 task: SchemaGuidedInstructBertTask = SchemaGuidedInstructBertTask.from_taskdir(
     task_dir,
     load_best_model=True,
@@ -75,12 +78,18 @@
     # ["ent_politics", "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed/test.jsonl"],
     # ["ent_science", "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed/test.jsonl"],
 
+    # zero-shot NER w/o instructions
+    ["ent_movie", "resources/Mirror/v1.4/ent/en/MIT_MOVIE_Review/instructed/remove_instruction/test.jsonl"],
+    ["ent_restaurant", "resources/Mirror/v1.4/ent/en/MIT_Restaurant_Review/instructed/remove_instruction/test.jsonl"],
+    ["ent_ai", "resources/Mirror/v1.4/ent/en/CrossNER_AI/instructed/remove_instruction/test.jsonl"],
+    ["ent_literature", "resources/Mirror/v1.4/ent/en/CrossNER_literature/instructed/remove_instruction/test.jsonl"],
+    ["ent_music", "resources/Mirror/v1.4/ent/en/CrossNER_music/instructed/remove_instruction/test.jsonl"],
+    ["ent_politics", "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed/remove_instruction/test.jsonl"],
+    ["ent_science", "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed/remove_instruction/test.jsonl"],
     # # discontinuous NER
     # ["discontinuous_ent", "resources/Mirror/new_abilities_v2/cadec/new/test.jsonl"],
-
     # # hyper-RE
     # ["hyper_rel", "resources/Mirror/new_abilities_v2/HyperRED/new/test.jsonl"],
-
     # # glue
     # ["cls_glue_cola", "resources/Mirror/v1.4/cls/en/CoLA/formated/test.jsonl"],
     # ["cls_glue_qqp", "resources/Mirror/v1.4/cls/en/QQP/new/dev.jsonl"],
@@ -89,10 +98,8 @@
     # ["cls_glue_qnli", "resources/Mirror/v1.4/cls/en/QNLI/processed/QNLI_dev.jsonl"],
     # ["cls_glue_rte", "resources/Mirror/v1.4/cls/en/RTE/formated/RTE_dev.jsonl"],
     # ["cls_glue_mrpc", "resources/Mirror/v1.4/cls/en/MRPC/formated/dev.jsonl"],
-
     # # mrc
     # ["span_squad2", "resources/Mirror/v1.4/span/en/squad_v2/dev.jsonl"],
-
     # Mirror v1.4 all train
     # ["cls_ag_news_train", "resources/Mirror/v1.4/cls/en/ag_news/instructed/train.jsonl"],
     # ["cls_ANLI_R1_train", "resources/Mirror/v1.4/cls/en/ANLI/R1_processed/train.jsonl"],
@@ -156,9 +163,8 @@
     # ["span_ms_marco_v2.1", "resources/Mirror/v1.4/span/en/ms_marco_v2.1/train.jsonl"],
     # ["span_newsqa", "resources/Mirror/v1.4/span/en/newsqa/train.jsonl"],
     # ["span_squad_v2", "resources/Mirror/v1.4/span/en/squad_v2/train.jsonl"],
-
     # # Mirror v1.4 all test
-    ["cls_ag_news_test", "resources/Mirror/v1.4/cls/en/ag_news/instructed/test.jsonl"],
+    # ["cls_ag_news_test", "resources/Mirror/v1.4/cls/en/ag_news/instructed/test.jsonl"],
     # ["cls_ANLI_R1_test", "resources/Mirror/v1.4/cls/en/ANLI/R1_processed/test.jsonl"],
     # ["cls_ANLI_R2_test", "resources/Mirror/v1.4/cls/en/ANLI/R2_processed/test.jsonl"],
     # ["cls_ANLI_R3_test", "resources/Mirror/v1.4/cls/en/ANLI/R3_processed/test.jsonl"],
@@ -817,4 +823,60 @@
 │ span_em │ span_subjqa_tripadvisor_train   │     38.765 │
 │ span_f1 │ span_subjqa_tripadvisor_train   │     36.225 │
 └─────────┴─────────────────────────────────┴────────────┘
+
+mirror_outputs/Mirror_Pretrain_DataV1.
+                 5_2
+┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Task ┃ Dataset        ┃ Metric (%) ┃
+┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ ent  │ ent_movie      │     34.907 │
+│ ent  │ ent_restaurant │     20.259 │
+│ ent  │ ent_ai         │     23.115 │
+│ ent  │ ent_literature │     36.566 │
+│ ent  │ ent_music      │     33.716 │
+│ ent  │ ent_politics   │     48.165 │
+│ ent  │ ent_science    │     44.995 │
+└──────┴────────────────┴────────────┘
+
+mirror_outputs/Mirror_Pretrain_AllExcl
+                uded_2
+┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Task ┃ Dataset        ┃ Metric (%) ┃
+┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ ent  │ ent_movie      │     38.360 │
+│ ent  │ ent_restaurant │     13.511 │
+│ ent  │ ent_ai         │     42.998 │
+│ ent  │ ent_literature │     41.638 │
+│ ent  │ ent_music      │     56.655 │
+│ ent  │ ent_politics   │     68.906 │
+│ ent  │ ent_science    │     53.454 │
+└──────┴────────────────┴────────────┘
+
+mirror_outputs/Mirror_Pretrain_AllExcl
+                uded_2
+┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Task ┃ Dataset        ┃ Metric (%) ┃
+┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ ent  │ ent_movie      │     39.201 │
+│ ent  │ ent_restaurant │     16.318 │
+│ ent  │ ent_ai         │     45.230 │
+│ ent  │ ent_literature │     46.318 │
+│ ent  │ ent_music      │     58.611 │
+│ ent  │ ent_politics   │     67.303 │
+│ ent  │ ent_science    │     54.837 │
+└──────┴────────────────┴────────────┘
+
+mirror_outputs/Mirror_Pretrain_DataV1.
+           5_woInstruction
+┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Task ┃ Dataset        ┃ Metric (%) ┃
+┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ ent  │ ent_movie      │     30.864 │
+│ ent  │ ent_restaurant │     12.439 │
+│ ent  │ ent_ai         │     32.317 │
+│ ent  │ ent_literature │     40.048 │
+│ ent  │ ent_music      │     40.600 │
+│ ent  │ ent_politics   │     46.247 │
+│ ent  │ ent_science    │     42.422 │
+└──────┴────────────────┴────────────┘
 """
diff --git a/resources/Mirror/remove_instruction.py b/resources/Mirror/remove_instruction.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+from rex.utils.io import dump_jsonlines, load_jsonlines
+
+# data_dir = Path("resources/Mirror/uie/rel/scierc")
+# data_dir = Path("resources/Mirror/v1.5/merged/t-rex-200k-woInstruction")
+
+
+def remove_instruction(data_dir):
+    """Dump copies of the jsonl splits in ``data_dir`` without instructions.
+
+    Each of ``train.jsonl``, ``dev.jsonl`` and ``test.jsonl`` is loaded, the
+    ``"instruction"`` key is dropped from every record, and the records are
+    written to ``data_dir/remove_instruction/<same file name>``.
+
+    Args:
+        data_dir: Directory containing the three split files, given as a
+            ``str`` or ``pathlib.Path``.
+    """
+    data_dir = Path(data_dir)
+    dump_dir = data_dir / "remove_instruction"
+    for fname in ["train.jsonl", "dev.jsonl", "test.jsonl"]:
+        new_data = []
+        for d in load_jsonlines(data_dir / fname):
+            # Records lacking the key are kept unchanged instead of raising
+            # KeyError halfway through, after earlier splits were written.
+            d.pop("instruction", None)
+            new_data.append(d)
+        # exist_ok makes this a no-op once the directory exists.
+        dump_dir.mkdir(parents=True, exist_ok=True)
+        dump_jsonlines(new_data, dump_dir / fname)
+
+
+if __name__ == "__main__":
+    data_dirs = [
+        "resources/Mirror/v1.4/ent/en/CrossNER_AI/instructed",
+        "resources/Mirror/v1.4/ent/en/CrossNER_literature/instructed",
+        "resources/Mirror/v1.4/ent/en/CrossNER_music/instructed",
+        "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed",
+        "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed",
+        "resources/Mirror/v1.4/ent/en/MIT_MOVIE_Review/instructed",
+        "resources/Mirror/v1.4/ent/en/MIT_Restaurant_Review/instructed",
+    ]
+    for data_dir in data_dirs:
+        remove_instruction(Path(data_dir))
diff --git a/resources/Mirror/uie/remove_instruction.py b/resources/Mirror/uie/remove_instruction.py