Skip to content

Commit

Permalink
update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Spico197 committed Jun 19, 2023
1 parent a0c2b24 commit 08acb19
Show file tree
Hide file tree
Showing 24 changed files with 661 additions and 55 deletions.
102 changes: 87 additions & 15 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion conf/Pretrain_excluded.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# task
task_type: SchemaGuidedInstructBertTask
task_name: Mirror_Pretrain_AllExcluded
task_name: Mirror_Pretrain_AllExcluded_2
comment: '~~content as label, (start, end + 1) span'

# data preprocessing
Expand Down
2 changes: 1 addition & 1 deletion conf/Pretrain_v1.5.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# task
task_type: SchemaGuidedInstructBertTask
task_name: Mirror_Pretrain_DataV1.5
task_name: Mirror_Pretrain_DataV1.5_2
comment: '~~content as label, (start, end + 1) span'

# data preprocessing
Expand Down
51 changes: 51 additions & 0 deletions conf/Pretrain_v1.5_woInstruction.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Pretraining config for the Mirror v1.5 merged data with instructions removed.
# task
task_type: SchemaGuidedInstructBertTask
task_name: Mirror_Pretrain_DataV1.5_woInstruction
comment: '~~content as label, (start, end + 1) span'

# data preprocessing
max_seq_len: 512
debug_mode: false
label_span: tag # tag `[LM]` or content `person`
mode: span # w2 (1,2,3) or span (1,3)
stream_mode: false

# filepaths
# NOTE(review): plm_dir is an absolute, machine-specific path; adjust per environment.
plm_dir: /data/tzhu/PLM/microsoft--deberta-v3-large
# Presumably produced by resources/Mirror/remove_instruction.py, which writes
# its output into a `remove_instruction/` subdirectory; verify before training.
data_dir: resources/Mirror/v1.5/merged/t-rex-200k-woInstruction/remove_instruction
output_dir: mirror_outputs
# With the values above this is mirror_outputs/Mirror_Pretrain_DataV1.5_woInstruction.
task_dir: ${output_dir}/${task_name}
train_filepath: ${data_dir}/train.jsonl
dev_filepath: ${data_dir}/dev.jsonl
test_filepath: ${data_dir}/test.jsonl
dump_cache_dir: ${task_dir}/cache
regenerate_cache: false

# training
random_seed: 1227
base_model_path: null
# Evaluation and best-checkpoint selection both use the train split (by loss);
# no final evaluation on test.
eval_on_data: [train]
select_best_on_data: train
select_best_by_key: loss
final_eval_on_test: false
save_every_ckpt: true
save_best_ckpt: true

warmup_proportion: 0.1
num_epochs: 3
# NOTE(review): -1 presumably disables the corresponding patience / step limit;
# confirm against the trainer implementation.
epoch_patience: -1
num_steps: -1
step_patience: -1
step_eval_interval: 10000
train_batch_size: 8
eval_batch_size: 8
grad_accum_steps: 1
# Explicit `!!float` tag: some YAML 1.1 loaders would otherwise read `2e-5`
# (no decimal point) as a string.
learning_rate: !!float 2e-5
other_learning_rate: !!float 1e-4
max_grad_norm: 1.0
weight_decay: 0.1

# model
dropout: 0.3
use_rope: true
biaffine_size: 512
3 changes: 2 additions & 1 deletion conf/uie_data/fewshot.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
num_epochs: 200
epoch_patience: 10
output_dir: mirror_fewshot_outputs
base_model_path: mirror_outputs/MirrorLarge_SamplingPretrain_woLowResource_woOverlap/ckpt/SchemaGuidedInstructBertModel.best.pth
base_model_path: mirror_outputs/Mirror_Pretrain_AllExcluded_2/ckpt/SchemaGuidedInstructBertModel.best.pth
save_every_ckpt: false
5 changes: 4 additions & 1 deletion conf/uie_data/wPretrain.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
plm_dir: /data/tzhu/PLM/microsoft--deberta-v3-large
base_model_path: mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap/ckpt/SchemaGuidedInstructBertModel.best.pth
base_model_path: mirror_outputs/Mirror_Pretrain_AllExcluded_2/ckpt/SchemaGuidedInstructBertModel.best.pth

stream_mode: false
train_filepath: ${data_dir}/train.jsonl
Expand All @@ -8,6 +8,9 @@ test_filepath: ${data_dir}/test.jsonl

num_epochs: 20
epoch_patience: 3
num_steps: -1
step_patience: -1
step_eval_interval: -1

eval_on_data: [dev]
select_best_on_data: dev
Expand Down
76 changes: 69 additions & 7 deletions eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@
# task_dir = "mirror_outputs/Mirror_UIE_wPT_woOverlapV2"
# task_dir = "mirror_outputs/Mirror_ExcludedPretrain_MultiTask"
# task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woZeroShotNER"
task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap"
# task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woOverlap"
# task_dir = "mirror_outputs/MirrorLarge_SamplingPretrain_woLowResource_woOverlap"
# task_dir = "mirror_outputs/Mirror_Pretrain_DataV1.5_2"
# task_dir = "mirror_outputs/Mirror_Pretrain_AllExcluded_2"
task_dir = "mirror_outputs/Mirror_Pretrain_DataV1.5_woInstruction"
task: SchemaGuidedInstructBertTask = SchemaGuidedInstructBertTask.from_taskdir(
task_dir,
load_best_model=True,
Expand Down Expand Up @@ -75,12 +78,18 @@
# ["ent_politics", "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed/test.jsonl"],
# ["ent_science", "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed/test.jsonl"],

# zero-shot NER w/o instructions
["ent_movie", "resources/Mirror/v1.4/ent/en/MIT_MOVIE_Review/instructed/remove_instruction/test.jsonl"],
["ent_restaurant", "resources/Mirror/v1.4/ent/en/MIT_Restaurant_Review/instructed/remove_instruction/test.jsonl"],
["ent_ai", "resources/Mirror/v1.4/ent/en/CrossNER_AI/instructed/remove_instruction/test.jsonl"],
["ent_literature", "resources/Mirror/v1.4/ent/en/CrossNER_literature/instructed/remove_instruction/test.jsonl"],
["ent_music", "resources/Mirror/v1.4/ent/en/CrossNER_music/instructed/remove_instruction/test.jsonl"],
["ent_politics", "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed/remove_instruction/test.jsonl"],
["ent_science", "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed/remove_instruction/test.jsonl"],
# # discontinuous NER
# ["discontinuous_ent", "resources/Mirror/new_abilities_v2/cadec/new/test.jsonl"],

# # hyper-RE
# ["hyper_rel", "resources/Mirror/new_abilities_v2/HyperRED/new/test.jsonl"],

# # glue
# ["cls_glue_cola", "resources/Mirror/v1.4/cls/en/CoLA/formated/test.jsonl"],
# ["cls_glue_qqp", "resources/Mirror/v1.4/cls/en/QQP/new/dev.jsonl"],
Expand All @@ -89,10 +98,8 @@
# ["cls_glue_qnli", "resources/Mirror/v1.4/cls/en/QNLI/processed/QNLI_dev.jsonl"],
# ["cls_glue_rte", "resources/Mirror/v1.4/cls/en/RTE/formated/RTE_dev.jsonl"],
# ["cls_glue_mrpc", "resources/Mirror/v1.4/cls/en/MRPC/formated/dev.jsonl"],

# # mrc
# ["span_squad2", "resources/Mirror/v1.4/span/en/squad_v2/dev.jsonl"],

# Mirror v1.4 all train
# ["cls_ag_news_train", "resources/Mirror/v1.4/cls/en/ag_news/instructed/train.jsonl"],
# ["cls_ANLI_R1_train", "resources/Mirror/v1.4/cls/en/ANLI/R1_processed/train.jsonl"],
Expand Down Expand Up @@ -156,9 +163,8 @@
# ["span_ms_marco_v2.1", "resources/Mirror/v1.4/span/en/ms_marco_v2.1/train.jsonl"],
# ["span_newsqa", "resources/Mirror/v1.4/span/en/newsqa/train.jsonl"],
# ["span_squad_v2", "resources/Mirror/v1.4/span/en/squad_v2/train.jsonl"],

# # Mirror v1.4 all test
["cls_ag_news_test", "resources/Mirror/v1.4/cls/en/ag_news/instructed/test.jsonl"],
# ["cls_ag_news_test", "resources/Mirror/v1.4/cls/en/ag_news/instructed/test.jsonl"],
# ["cls_ANLI_R1_test", "resources/Mirror/v1.4/cls/en/ANLI/R1_processed/test.jsonl"],
# ["cls_ANLI_R2_test", "resources/Mirror/v1.4/cls/en/ANLI/R2_processed/test.jsonl"],
# ["cls_ANLI_R3_test", "resources/Mirror/v1.4/cls/en/ANLI/R3_processed/test.jsonl"],
Expand Down Expand Up @@ -817,4 +823,60 @@
│ span_em │ span_subjqa_tripadvisor_train │ 38.765 │
│ span_f1 │ span_subjqa_tripadvisor_train │ 36.225 │
└─────────┴─────────────────────────────────┴────────────┘
mirror_outputs/Mirror_Pretrain_DataV1.
5_2
┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Task ┃ Dataset ┃ Metric (%) ┃
┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ ent │ ent_movie │ 34.907 │
│ ent │ ent_restaurant │ 20.259 │
│ ent │ ent_ai │ 23.115 │
│ ent │ ent_literature │ 36.566 │
│ ent │ ent_music │ 33.716 │
│ ent │ ent_politics │ 48.165 │
│ ent │ ent_science │ 44.995 │
└──────┴────────────────┴────────────┘
mirror_outputs/Mirror_Pretrain_AllExcl
uded_2
┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Task ┃ Dataset ┃ Metric (%) ┃
┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ ent │ ent_movie │ 38.360 │
│ ent │ ent_restaurant │ 13.511 │
│ ent │ ent_ai │ 42.998 │
│ ent │ ent_literature │ 41.638 │
│ ent │ ent_music │ 56.655 │
│ ent │ ent_politics │ 68.906 │
│ ent │ ent_science │ 53.454 │
└──────┴────────────────┴────────────┘
mirror_outputs/Mirror_Pretrain_AllExcl
uded_2
┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Task ┃ Dataset ┃ Metric (%) ┃
┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ ent │ ent_movie │ 39.201 │
│ ent │ ent_restaurant │ 16.318 │
│ ent │ ent_ai │ 45.230 │
│ ent │ ent_literature │ 46.318 │
│ ent │ ent_music │ 58.611 │
│ ent │ ent_politics │ 67.303 │
│ ent │ ent_science │ 54.837 │
└──────┴────────────────┴────────────┘
mirror_outputs/Mirror_Pretrain_DataV1.
5_woInstruction
┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Task ┃ Dataset ┃ Metric (%) ┃
┡━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ ent │ ent_movie │ 30.864 │
│ ent │ ent_restaurant │ 12.439 │
│ ent │ ent_ai │ 32.317 │
│ ent │ ent_literature │ 40.048 │
│ ent │ ent_music │ 40.600 │
│ ent │ ent_politics │ 46.247 │
│ ent │ ent_science │ 42.422 │
└──────┴────────────────┴────────────┘
"""
33 changes: 33 additions & 0 deletions resources/Mirror/remove_instruction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from pathlib import Path

from rex.utils.io import dump_jsonlines, load_jsonlines

# data_dir = Path("resources/Mirror/uie/rel/scierc")
# data_dir = Path("resources/Mirror/v1.5/merged/t-rex-200k-woInstruction")


def remove_instruction(data_dir):
    """Write instruction-free copies of the dataset splits found in ``data_dir``.

    For each of ``train.jsonl``, ``dev.jsonl`` and ``test.jsonl`` the records
    are loaded, the ``"instruction"`` key is dropped from every record, and the
    result is dumped to ``<data_dir>/remove_instruction/<split file name>``.
    Only the ``remove_instruction`` subdirectory is written to.

    Args:
        data_dir: Directory holding the three split files, given either as a
            ``pathlib.Path`` or as a plain string path.
    """
    data_dir = Path(data_dir)
    dump_dir = data_dir / "remove_instruction"
    # The output directory is the same for every split, so create it once.
    dump_dir.mkdir(parents=True, exist_ok=True)

    for fname in ("train.jsonl", "dev.jsonl", "test.jsonl"):
        # Build new records instead of `del d["instruction"]`, so a record
        # that has no instruction (e.g. already-cleaned data) does not raise
        # KeyError. The order of the remaining keys is preserved.
        stripped = [
            {key: value for key, value in record.items() if key != "instruction"}
            for record in load_jsonlines(data_dir / fname)
        ]
        dump_jsonlines(stripped, dump_dir / fname)


if __name__ == "__main__":
    # Instructed dataset directories that each get an instruction-free copy.
    instructed_dirs = (
        "resources/Mirror/v1.4/ent/en/CrossNER_AI/instructed",
        "resources/Mirror/v1.4/ent/en/CrossNER_literature/instructed",
        "resources/Mirror/v1.4/ent/en/CrossNER_music/instructed",
        "resources/Mirror/v1.4/ent/en/CrossNER_politics/instructed",
        "resources/Mirror/v1.4/ent/en/CrossNER_science/instructed",
        "resources/Mirror/v1.4/ent/en/MIT_MOVIE_Review/instructed",
        "resources/Mirror/v1.4/ent/en/MIT_Restaurant_Review/instructed",
    )
    # Process the directories one by one, in the listed order.
    for instructed_dir in map(Path, instructed_dirs):
        remove_instruction(instructed_dir)
16 changes: 0 additions & 16 deletions resources/Mirror/uie/remove_instruction.py

This file was deleted.

Loading

0 comments on commit 08acb19

Please sign in to comment.