Commit 5075076: add chinese

lonePatient committed Mar 25, 2020 · 1 parent e453a4b
Showing 24 changed files with 42,408 additions and 34 deletions.
20 changes: 15 additions & 5 deletions README.md
@@ -22,7 +22,10 @@ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning

**English**: Official download links: [google electra](https://github.com/google-research/electra)

-**Chinese**: https://github.com/CLUEbenchmark/ELECTRA
+**Chinese**:
+
+* https://github.com/CLUEbenchmark/ELECTRA
+* https://github.com/ymcui/Chinese-ELECTRA

## Fine-tuning

@@ -35,6 +38,7 @@ example:
| | └── config.json
| | └── vocab.txt
```

2. Convert the electra TF checkpoint to PyTorch:
```python
python convert_electra_tf_checkpoint_to_pytorch.py \
    --tf_checkpoint_path=./prev_trained_model/electra_tiny \
    --electra_config_file=./prev_trained_model/electra_tiny/config.json \
    --pytorch_dump_path=./prev_trained_model/electra_tiny/pytorch_model.bin
```
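After conversion you can sanity-check the dump by rebuilding the model from the config and loading the saved weights. A minimal sketch, assuming `ElectraConfig` and `ElectraForPreTraining` are importable from this repo's `model` package the same way the converter script uses them:

```python
import torch
from model.modeling_electra import ElectraConfig, ElectraForPreTraining  # assumed import path

# Rebuild the architecture from the config used during conversion
config = ElectraConfig.from_pretrained("./prev_trained_model/electra_tiny/config.json")
model = ElectraForPreTraining(config)

# Load the converted weights and switch to inference mode
state_dict = torch.load("./prev_trained_model/electra_tiny/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()
```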
@@ -45,11 +49,11 @@

Before running any of these GLUE/CLUE tasks, download the [GLUE data](https://gluebenchmark.com/tasks) / [CLUE data](https://www.cluebenchmarks.com/introduce.html) by running the script named `download_xxxx_data` in the `tools` directory, and unpack it to some directory `$DATA_DIR`.

-3. Run `sh run_classifier_sst2.sh` to fine-tune the electra model
+3. Run `sh scripts/run_classifier_sst2.sh` to fine-tune the electra model

## Results

-Performance of **electra** on GLUE benchmark results using a single-model setup on **dev**:
+Performance of **electra** on the `GLUE` benchmark using a single-model setup on **dev**:

| | CoLA | SST-2 | MNLI | STS-B |
| :------- | :---------: | :---------: | :---------: | :---------: |
@@ -58,14 +62,20 @@
| electra_base | 67.8 | 94.2 | | 91.1 |
| electra_large | 71.1 | 95.8 | | 92.4 |

-Performance of **electra** on CLUE benchmark results using a single-model setup on **dev**:
+Performance of **electra** on the `CLUE` benchmark using a single-model setup on **dev**:

| | AFQMC| TNEWS | IFLYTEK |
| :------- | :---------: | :---------: |:---------: |
| metrics | accuracy | accuracy | accuracy |
| electra_tiny | 69.82 | 54.48 | 56.98 |

Performance of **electra** on the `lcqmc` and `chnsenticorp` tasks using a single-model setup on **dev**:

| | chnsenticorp |
| :------- | :---------: |
| metrics | accuracy |
| electra_small | 92.75 |
| electra_base | 94.08 |

## Pretraining

10 changes: 5 additions & 5 deletions convert_electra_tf_checkpoint_to_pytorch.py
@@ -25,9 +25,9 @@
import logging
logging.basicConfig(level=logging.INFO)

-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, electra_config_file, pytorch_dump_path):
     # Initialise PyTorch model
-    config = ElectraConfig.from_pretrained(bert_config_file)
+    config = ElectraConfig.from_pretrained(electra_config_file)
     # print("Building PyTorch model from configuration: {}".format(str(config)))
     model = ElectraForPreTraining(config)
     # Load weights from tf checkpoint
@@ -63,7 +63,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):

'''
python convert_electra_tf_checkpoint_to_pytorch.py \
-    --tf_checkpoint_path=./prev_trained_model/electra_tiny \
-    --electra_config_file=./prev_trained_model/electra_tiny/config.json \
-    --pytorch_dump_path=./prev_trained_model/electra_tiny/pytorch_model.bin
+    --tf_checkpoint_path=./prev_trained_model/electra_base_zh \
+    --electra_config_file=./prev_trained_model/electra_base_zh/config.json \
+    --pytorch_dump_path=./prev_trained_model/electra_base_zh/pytorch_model.bin
'''
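The function body elided above follows the standard converter tail: copy the TF variables into the freshly built module, then serialize the state dict. A sketch of that tail, assuming it uses `load_tf_weights_in_electra` from `model/modeling_electra.py` (shown below):

```python
# Sketch (assumed, not verbatim): load each TF variable into the matching
# PyTorch parameter, then dump the weights where the fine-tuning scripts expect them.
load_tf_weights_in_electra(model, config, tf_checkpoint_path)
print("Saving PyTorch model to {}".format(pytorch_dump_path))
torch.save(model.state_dict(), pytorch_dump_path)
```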
2 changes: 2 additions & 0 deletions metrics/task_compute_metrics.py
@@ -87,5 +87,7 @@ def compute_metrics(task_name, preds, labels):
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "copa":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "chnsenticorp":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
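Every branch here delegates to `simple_accuracy`, which in metrics modules of this style is just the mean of exact matches. A sketch of the assumed helper, plus the new branch in action:

```python
import numpy as np

def simple_accuracy(preds, labels):
    # Fraction of predictions that exactly match the gold labels
    return (np.asarray(preds) == np.asarray(labels)).mean()

# compute_metrics("chnsenticorp", preds=[1, 0, 1], labels=[1, 1, 1])
# -> {"acc": 0.6666666666666666}
```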
2 changes: 2 additions & 0 deletions model/modeling_electra.py
@@ -10,6 +10,7 @@
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_electra import get_generator_config
+from .modeling_utils import temperature_sampling
import torch.nn.functional as F
logger = logging.getLogger(__name__)

@@ -39,6 +40,7 @@ def load_tf_weights_in_electra(model, config, tf_checkpoint_path):
    names = []
    arrays = []
    for name, shape in init_vars:
        # print("Loading TF weight {} with shape {}".format(name, shape))
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
20 changes: 6 additions & 14 deletions model/modeling_utils.py
@@ -754,17 +754,9 @@ def prune_layer(layer, index, dim=None):
     else:
         raise ValueError("Can't prune layer of class {}".format(layer.__class__))

-def temperature_sampling(logits, temperature, do_sample=True):
-    assert temperature >= 0
-    if do_sample:
-        if temperature != 1.0:
-            logits = logits / temperature
-        # Sample
-        batch_size, sequence_size, hidden_size = logits.size()
-        logits = logits.view(-1, hidden_size)
-        token_ids = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
-        token_ids = token_ids.view(batch_size, sequence_size)
-    else:
-        # Greedy decoding
-        token_ids = torch.argmax(logits, dim=-1)
-    return token_ids
+def temperature_sampling(logits, temperature):
+    # Greedy decoding when no temperature is given
+    if temperature is None or temperature == 0.0:
+        return torch.argmax(logits, dim=-1)
+    # Flatten to (positions, vocab) so multinomial draws one token id per position
+    probs = F.softmax(logits / temperature, dim=-1)
+    batch_size, sequence_size, vocab_size = probs.size()
+    pred_ids = torch.multinomial(probs.view(-1, vocab_size), num_samples=1)
+    return pred_ids.view(batch_size, sequence_size)
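In ELECTRA pretraining this helper turns the generator's masked-LM logits into the replacement tokens that the discriminator must detect, which is why `model/modeling_electra.py` now imports it (see above). A toy usage sketch, assuming logits shaped `(batch, seq_len, vocab)`:

```python
import torch

# Fake generator logits: batch of 2 sequences of length 4 over a 10-token vocab
logits = torch.randn(2, 4, 10)

sampled = temperature_sampling(logits, temperature=1.0)  # stochastic token ids
greedy = temperature_sampling(logits, temperature=0.0)   # argmax decoding
print(sampled.shape, greedy.shape)  # torch.Size([2, 4]) torch.Size([2, 4])
```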
15 changes: 15 additions & 0 deletions prev_trained_model/electra_base_zh/config.json
@@ -0,0 +1,15 @@
{
  "vocab_size": 21128,
  "embedding_size": 768,
  "hidden_size": 768,
  "num_hidden_layers": 12,
  "num_attention_heads": 12,
  "intermediate_size": 3072,
  "generator_size": "1/4",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "max_position_embeddings": 512,
  "type_vocab_size": 2,
  "initializer_range": 0.02
}
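This is the discriminator configuration for the Chinese base model; `"generator_size": "1/4"` is presumably what `get_generator_config` (imported in `model/modeling_electra.py` above) uses to shrink the generator relative to these sizes. A minimal sketch of reading the file, mirroring the converter's usage (import path assumed):

```python
from model.modeling_electra import ElectraConfig  # assumed import path

config = ElectraConfig.from_pretrained("./prev_trained_model/electra_base_zh/config.json")
print(config.hidden_size, config.num_hidden_layers)  # 768 12
```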
