
Commit 0105829

Merge pull request #560 from yaoguany/main
Allow exceeding the model maximum length during training & inference
2 parents 1449cd4 + 78159d0

4 files changed: +40 -14 lines


src/lmflow/args.py

Lines changed: 8 additions & 0 deletions

@@ -198,6 +198,14 @@ class ModelArguments:
             )
         }
     )
+    truncate_to_model_max_length: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "whether to truncate the dataset to the model max length."
+            )
+        }
+    )
     use_int8: bool = field(
         default=False,
         metadata={"help": "whether to load int8 quantization for inference"}

src/lmflow/models/hf_decoder_model.py

Lines changed: 3 additions & 1 deletion

@@ -248,7 +248,9 @@ def __init__(
         # We resize the embeddings only when necessary to avoid index errors.
         # If you are creating a model from scratch on a small vocab and want a
         # smaller embedding size, remove this test.
-        embedding_size = model.get_input_embeddings().weight.shape[0]
+        with deepspeed.zero.GatheredParameters(model.get_input_embeddings().weight, modifier_rank=None):
+            weights = model.get_input_embeddings().weight
+            embedding_size = weights.shape[0]
         if len(tokenizer) > embedding_size:
             model.resize_token_embeddings(len(tokenizer))
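
Why the gather is needed: under DeepSpeed ZeRO stage 3 the parameter tensors are partitioned across ranks, so reading `.weight.shape` outside a gather can report the local placeholder shape instead of the full vocabulary dimension. `deepspeed.zero.GatheredParameters` temporarily materializes the full parameter inside the `with` block, and `modifier_rank=None` declares read-only access; when the model is not ZeRO-managed the context manager is a no-op. A runnable sketch of the same pattern (gpt2 is used purely as a small stand-in checkpoint):

    import deepspeed
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # No-op without ZeRO-3, so the same code path works for both sharded
    # and unsharded runs.
    with deepspeed.zero.GatheredParameters(
        model.get_input_embeddings().weight, modifier_rank=None
    ):
        embedding_size = model.get_input_embeddings().weight.shape[0]

    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))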

src/lmflow/pipeline/evaluator.py

Lines changed: 8 additions & 5 deletions

@@ -323,11 +323,14 @@ def _evaluate_ppl(self, model, dataset: Dataset, verbose=True):
         texts = [ instance["text"] for instance in data_dict["instances"] ]
         encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt")
         # Define some constant
-        try:
-            max_length = min(model.get_backend_model().config.n_positions, model.get_max_length())
-        except:
-            max_length = min(1024, model.get_max_length())
-
+        if self.model_args.truncate_to_model_max_length:
+            try:
+                max_length = min(model.get_backend_model().config.n_positions, model.get_max_length())
+            except:
+                max_length = min(1024, model.get_max_length())
+        else:
+            max_length = self.block_size
+
         if verbose:
             print(f"The maximum sequence length : {max_length}")
         seq_len = encodings.input_ids.size(1)
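
Restated as a standalone helper (hypothetical, not part of the repo), the selection logic reads: with truncation enabled, prefer the config's `n_positions` (present on GPT-2-style configs, absent elsewhere, hence the try/except) capped by the tokenizer's maximum, falling back to 1024; with truncation disabled, trust the user-supplied block size.

    # Hypothetical helper mirroring the branch above; `model` is assumed to be
    # an lmflow model wrapper exposing get_backend_model() and get_max_length().
    def resolve_max_length(model, truncate_to_model_max_length: bool,
                           block_size: int) -> int:
        if truncate_to_model_max_length:
            try:
                # GPT-2-style configs expose n_positions; others raise here.
                return min(model.get_backend_model().config.n_positions,
                           model.get_max_length())
            except AttributeError:
                return min(1024, model.get_max_length())
        return block_size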

src/lmflow/pipeline/finetuner.py

Lines changed: 21 additions & 8 deletions

@@ -130,14 +130,27 @@ def group_text(self, tokenized_datasets, model_max_length):
                 block_size = 1024
         else:
             if data_args.block_size > model_max_length:
-                logger.warning(
-                    f"The block_size passed ({data_args.block_size}) is larger"
-                    f" than the maximum length for the model"
-                    f"({model_max_length})."
-                    f" Using block_size={model_max_length}."
-                )
-            block_size = min(data_args.block_size, model_max_length)
-
+                if self.model_args.truncate_to_model_max_length:
+                    logger.warning(
+                        f"The block_size passed ({data_args.block_size}) is larger"
+                        f" than the maximum length for the model"
+                        f" ({model_max_length})."
+                        f" Using block_size={model_max_length}."
+                        f" If you would like to use a block_size longer than"
+                        f" the maximum length supported by the model, you can"
+                        f" override this behavior with"
+                        f" `--truncate_to_model_max_length False`."
+                    )
+                    block_size = model_max_length
+                else:
+                    logger.warning(
+                        f"The block_size passed ({data_args.block_size}) is larger"
+                        f" than the maximum length for the model"
+                        f" ({model_max_length})."
+                        f" Using block_size={data_args.block_size}.")
+                    block_size = data_args.block_size
+            else:
+                block_size = data_args.block_size
         # Main data processing function that will concatenate all texts from
         # our dataset and generate chunks of block_size.
         def group_texts(examples):
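
For reference, the `group_texts` named in the trailing context follows the standard concatenate-then-chunk pattern from Hugging Face's run_clm example; a self-contained sketch of that pattern (not the exact repo code) shows why `block_size` is the quantity being guarded above: every training example ends up exactly `block_size` tokens long.

    # Sketch of the standard group_texts pattern (adapted from the Hugging
    # Face run_clm example; the repo's version may differ in detail).
    def group_texts(examples, block_size=1024):
        # Concatenate every tokenized field (input_ids, attention_mask, ...).
        concatenated = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated[list(examples.keys())[0]])
        # Drop the ragged tail so every chunk is exactly block_size tokens.
        total_length = (total_length // block_size) * block_size
        result = {
            k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated.items()
        }
        # Causal LM training predicts the next token, so labels mirror inputs.
        result["labels"] = result["input_ids"].copy()
        return result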
