ner.conf
conll03_ner {
  task = "ner"
  dataset = "conll03_ner"
  data_dir = ${ASP}/data/conll03_ner/
  model_dir = ${ASP}/data/conll03_ner/
  log_root = /cluster/home/tialiu/tianyu/gen_coref_dump/ner/
  max_segment_len = 256

  # Learning
  optimizer = "adamw"
  use_amp = true
  plm_learning_rate = 5e-5
  task_learning_rate = 1e-4
  plm_scheduler = "linear_with_warmup"  # constant / constant_with_warmup / linear_with_warmup
  task_scheduler = "linear_with_warmup"
  warmup_ratio = 0.05
  adam_eps = 1e-8
  adam_weight_decay = 0.1
  max_grad_norm = 1  # Set 0 to disable clipping
  gradient_accumulation_steps = 1
  batch_size = 1
  num_epochs = 20

  # Model hyperparameters.
  activation = "relu"
  init_std = 0.02
  feature_emb_size = 20
  hidden_size = 150
  dropout_rate = 0.3

  # Number of types.
  num_typing_classes = 4

  # Other.
  beam_size = 1
  eval_frequency = 1000
  report_frequency = 50
  plm_tokenizer_name = t5-small
}
t5_base = ${conll03_ner}{
  plm_learning_rate = 5e-5
  task_learning_rate = 3e-4
  hidden_size = 150
  plm_pretrained_name_or_path = t5-base
  eval_frequency = 1000
}

t5_large = ${t5_base}{
  plm_pretrained_name_or_path = t5-large
}

flant5_base = ${t5_large}{
  plm_pretrained_name_or_path = google/flan-t5-base
}

flant5_large = ${t5_large}{
  plm_pretrained_name_or_path = google/flan-t5-large
}

t5_3b = ${t5_base}{
  plm_learning_rate = 3e-5
  task_learning_rate = 3e-4
  plm_pretrained_name_or_path = t5-3b
}

t0_3b = ${t5_3b}{
  plm_pretrained_name_or_path = bigscience/T0_3B
}

flant5_xl = ${t5_3b}{
  plm_pretrained_name_or_path = google/flan-t5-xl
}

flant5_xxl = ${t5_3b}{
  plm_pretrained_name_or_path = google/flan-t5-xxl
}
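
The file is plain HOCON: each named block after conll03_ner inherits and overrides the base settings via ${...} substitution, and an experiment is selected by block name. The following is a minimal sketch (not the repository's own loader) of resolving one block with pyhocon; the "/path/to/ASP" root and the choice of flant5_base are assumptions for illustration, since ${ASP} is otherwise left to the environment.

# load_ner_conf.py -- illustrative only; assumes pyhocon is installed and
# that ner.conf sits in the current directory.
from pyhocon import ConfigFactory

# Hypothetical repository root supplied for the ${ASP} substitution
# (prepended so the reference resolves without relying on an env variable).
raw = 'ASP = "/path/to/ASP"\n' + open("ner.conf").read()
configs = ConfigFactory.parse_string(raw)

# Pick one experiment block by name, e.g. flant5_base.
experiment = configs["flant5_base"]

print(experiment["plm_pretrained_name_or_path"])  # google/flan-t5-base
print(experiment["task_learning_rate"])           # 3e-4, inherited from t5_base
print(experiment["num_typing_classes"])           # 4, inherited from conll03_ner

Because flant5_base is built as ${t5_large}{...}, which in turn extends ${t5_base} and ${conll03_ner}, any key not overridden in the block (learning rates, scheduler, num_typing_classes, and so on) falls through to the base conll03_ner settings.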