ADLR/megatron-lm!2432 - Small VLM example
trintamaki committed Dec 11, 2024
1 parent 215a2eb commit 2aa3522
Showing 15 changed files with 324 additions and 91 deletions.
50 changes: 17 additions & 33 deletions examples/multimodal/config.py
@@ -7,34 +7,20 @@
 def get_language_model_config(config):
-    if config.language_model_type == "2b":
+    if config.language_model_type == "llama3_8b":
         config.activation_func = torch.nn.functional.silu
         config.add_bias_linear = False
         config.bias_activation_fusion = False
         config.gated_linear_unit = True
-        config.apply_query_key_layer_scaling = True
-        config.layernorm_zero_centered_gamma = True
-        config.bias_dropout_fusion = False
-        config.rotary_percent = 0.5
-        config.apply_rope_fusion = False
-        config.attention_softmax_in_fp32 = True
-    elif config.language_model_type == "8b":
-        config.add_bias_linear = False
-        config.bias_activation_fusion = False
-        config.gated_linear_unit = False
-        config.apply_query_key_layer_scaling = True
-        config.layernorm_zero_centered_gamma = True
+        config.apply_query_key_layer_scaling = False
+        config.layernorm_zero_centered_gamma = (
+            False  # Zero centered gamma not supported for RMSNorm
+        )
         config.bias_dropout_fusion = False
-        config.rotary_percent = 0.5
-        config.attention_dropout = 0.0
         config.apply_rope_fusion = False
-        config.activation_func = squared_relu
-        config.ffn_hidden_size = 16384
-        config.masked_softmax_fusion = True
         config.attention_softmax_in_fp32 = True
-        config.num_query_groups = 32
-        config.kv_channels = 128
-        config.rotary_interleaved = False
-    elif config.language_model_type == "llama3_8b":
+        config.ffn_hidden_size = 14336
+    elif config.language_model_type == "mistral_7b":
         config.activation_func = torch.nn.functional.silu
         config.add_bias_linear = False
         config.bias_activation_fusion = False
@@ -47,7 +33,7 @@ def get_language_model_config(config):
         config.apply_rope_fusion = False
         config.attention_softmax_in_fp32 = True
         config.ffn_hidden_size = 14336
-    elif config.language_model_type == "mistral_7b":
+    elif config.language_model_type == "yi-34b":
         config.activation_func = torch.nn.functional.silu
         config.add_bias_linear = False
         config.bias_activation_fusion = False
@@ -59,10 +45,11 @@ def get_language_model_config(config):
         config.bias_dropout_fusion = False
         config.apply_rope_fusion = False
         config.attention_softmax_in_fp32 = True
-        config.ffn_hidden_size = 14336
-    elif config.language_model_type == "yi-34b":
+        config.ffn_hidden_size = 20480
+    elif config.language_model_type == "qwen2.5_7B":
         config.activation_func = torch.nn.functional.silu
         config.add_bias_linear = False
+        config.add_qkv_bias = True
         config.bias_activation_fusion = False
         config.gated_linear_unit = True
         config.apply_query_key_layer_scaling = False
@@ -72,7 +59,7 @@ def get_language_model_config(config):
         config.bias_dropout_fusion = False
         config.apply_rope_fusion = False
         config.attention_softmax_in_fp32 = True
-        config.ffn_hidden_size = 20480
+        config.ffn_hidden_size = 18944
     elif config.language_model_type == "qwen2.0_72B":
         config.activation_func = torch.nn.functional.silu
         config.add_bias_linear = False
@@ -168,13 +155,7 @@ def get_vision_projection_config(config, hidden_size):
     config.bias_activation_fusion = False
     config.add_bias_linear = False
     config.hidden_size = hidden_size  # Used as the vision projection output size, i.e., the input to the language model.
-    if config.language_model_type == "2b":
-        config.ffn_hidden_size = 5440
-        config.activation_func = torch.nn.functional.gelu
-    if config.language_model_type == "8b":
-        config.ffn_hidden_size = 16384
-        config.activation_func = squared_relu
-    elif config.language_model_type == "llama3_8b":
+    if config.language_model_type == "llama3_8b":
         config.ffn_hidden_size = 14336
         config.activation_func = torch.nn.functional.gelu
     elif config.language_model_type == "mistral_7b":
@@ -185,6 +166,9 @@ def get_vision_projection_config(config, hidden_size):
         config.ffn_hidden_size = 20480
         config.normalization = "LayerNorm"
         config.activation_func = torch.nn.functional.gelu
+    elif config.language_model_type == "qwen2.5_7B":
+        config.ffn_hidden_size = 3584
+        config.activation_func = torch.nn.functional.gelu
     elif config.language_model_type == "qwen2.0_72B":
         config.ffn_hidden_size = 29568
         config.normalization = "LayerNorm"
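For orientation: both helpers above dispatch on config.language_model_type and mutate the config in place. Below is a minimal, self-contained sketch of that pattern using the qwen2.5_7B values from this diff; the function name and the SimpleNamespace stand-in are illustrative assumptions, not code from the commit.

    # Illustrative sketch, not part of the commit. Values are taken from the
    # qwen2.5_7B branches added above.
    from types import SimpleNamespace

    def set_qwen25_7b_defaults(config):
        # Language model side: qwen2.5_7B uses a QKV bias but no other linear
        # biases, gated linear units, and an FFN hidden size of 18944.
        if config.language_model_type == "qwen2.5_7B":
            config.add_bias_linear = False
            config.add_qkv_bias = True
            config.gated_linear_unit = True
            config.ffn_hidden_size = 18944
        else:
            raise ValueError(f"unsupported model type: {config.language_model_type}")
        return config

    config = set_qwen25_7b_defaults(SimpleNamespace(language_model_type="qwen2.5_7B"))
    assert config.ffn_hidden_size == 18944

Note the asymmetry in the diff: the language model FFN hidden size for qwen2.5_7B is 18944, while the vision projection for the same model type gets ffn_hidden_size = 3584.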
22 changes: 14 additions & 8 deletions examples/multimodal/evaluate_ai2d.py
@@ -9,19 +9,25 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D")
 
-    results = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                results.append(
-                    {
-                        "question_id": res["sample_id"],
-                        "answer": res["answer"],
-                        "gt_answer": res["gt_answer"],
-                    }
-                )
+                sample_id = res["sample_id"]
+
+                # Ignore possible duplicates.
+                if sample_id in results:
+                    continue
+
+                results[sample_id] = {
+                    "question_id": sample_id,
+                    "answer": res["answer"],
+                    "gt_answer": res["gt_answer"],
+                }
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
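The same merge-and-dedup pattern recurs in the ChartQA, COCO, MathVista, OCRBench, TextVQA, and VQAv2 scripts below: results are accumulated in a dict keyed by sample_id so that the first occurrence wins. A self-contained toy version (the JSONL records are invented for illustration, not real evaluator output):

    # Toy reproduction of the dedup logic above; records are invented.
    import json

    lines = [
        '{"sample_id": "q1", "answer": "A", "gt_answer": "A"}',
        '{"sample_id": "q1", "answer": "B", "gt_answer": "A"}',  # duplicate, ignored
        '{"sample_id": "q2", "answer": "C", "gt_answer": "C"}',
    ]

    results = dict()
    for line in lines:
        res = json.loads(line)
        sample_id = res["sample_id"]

        # First occurrence wins, as in merge_input_files above.
        if sample_id in results:
            continue

        results[sample_id] = {
            "question_id": sample_id,
            "answer": res["answer"],
            "gt_answer": res["gt_answer"],
        }

    results = list(results.values())
    assert [r["answer"] for r in results] == ["A", "C"]

Since dict preserves insertion order in Python 3.7+, the merged output keeps first-seen order, which is presumably why the commented-out sorting workaround in evaluate_textvqa.py could be dropped.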
13 changes: 10 additions & 3 deletions examples/multimodal/evaluate_chartqa.py
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA")
 
-    results = []
+    results = dict()
 
    for input_file_path in input_file_paths:
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                res = json.loads(line)
-                res["question_id"] = res["sample_id"]
-
-                results.append(res)
+                sample_id = res["sample_id"]
+
+                # Ignore possible duplicates.
+                if sample_id in results:
+                    continue
+
+                res["question_id"] = sample_id
+                results[sample_id] = res
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
18 changes: 13 additions & 5 deletions examples/multimodal/evaluate_coco.py
@@ -11,20 +11,28 @@ def convert_to_coco_format(input_path):
     """Convert input files to COCO compatible format."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning")
 
-    captions = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-
-                question_id = res['sample_id']
-                caption = res['caption'].rstrip('.').lower()
-
-                captions.append({"image_id": question_id, "caption": caption})
+                sample_id = res["sample_id"]
+
+                # Ignore possible duplicates.
+                if sample_id in results:
+                    continue
+
+                caption = res["caption"].rstrip(".").lower()
+                results[sample_id] = {
+                    "image_id": sample_id,
+                    "caption": caption,
+                }
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
-        json.dump(captions, output_file, indent=4)
+        json.dump(results, output_file, indent=4)
 
     return output_file_path

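Besides the dedup, the captioning script normalizes each caption before scoring: the trailing period is stripped and the text lowercased. A one-line illustration (the caption string is invented):

    # Caption normalization as in convert_to_coco_format above.
    caption = "A dog catches a frisbee."
    assert caption.rstrip(".").lower() == "a dog catches a frisbee"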
12 changes: 10 additions & 2 deletions examples/multimodal/evaluate_mathvista.py
@@ -11,13 +11,21 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista")
 
-    results = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                results.append(res)
+                sample_id = res["sample_id"]
+
+                # Remove possible duplicates.
+                if sample_id in results:
+                    continue
+
+                results[sample_id] = res
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
4 changes: 4 additions & 0 deletions examples/multimodal/evaluate_mmmu.py
@@ -48,6 +48,10 @@ def convert_to_mmmu_format(input_path):
         )
 
         # MMMU eval script expects just a sample_id to prediction mapping.
+        # Skip possible duplicates.
+        if sample_id in output:
+            continue
+
         output[sample_id] = prediction
 
     with open(output_file_path, "w") as output_file:
12 changes: 10 additions & 2 deletions examples/multimodal/evaluate_ocrbench.py
@@ -8,13 +8,21 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
 
-    results = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                results.append(res)
+                sample_id = res["sample_id"]
+
+                # Remove possible duplicates.
+                if sample_id in results:
+                    continue
+
+                results[sample_id] = res
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
25 changes: 14 additions & 11 deletions examples/multimodal/evaluate_textvqa.py
@@ -9,22 +9,25 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA")
 
-    results = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                results.append(
-                    {
-                        "question_id": res["sample_id"],
-                        "answer": res["answer"],
-                        "gt_answer": res["gt_answer"],
-                    }
-                )
-
-    # Make order deterministic.
-    # results = sorted(results, key=lambda d: d["question_id"])
+                sample_id = res["sample_id"]
+
+                # Remove possible duplicates.
+                if sample_id in results:
+                    continue
+
+                results[sample_id] = {
+                    "question_id": sample_id,
+                    "answer": res["answer"],
+                    "gt_answer": res["gt_answer"],
+                }
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
16 changes: 13 additions & 3 deletions examples/multimodal/evaluate_vqav2.py
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
 
-    results = []
+    results = dict()
 
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                res["question_id"] = res["sample_id"]
-
-                results.append(res)
+                sample_id = res["sample_id"]
+
+                # Skip possible duplicates.
+                if sample_id in results:
+                    continue
+
+                res["question_id"] = sample_id
+                results[sample_id] = res
+
+    results = list(results.values())
 
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
@@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task):
         assert len(gt) == 1, "expected exactly one groundtruth answer."
         gt = gt[0]
 
+        pred = pred.rstrip("%")
+        gt = gt.rstrip("%")
+
         if is_number(pred) and is_number(gt):
             pred = float(pred)
             gt = float(gt)
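The new rstrip("%") lines matter for numeric answers: a prediction like "45%" against a ground truth of "45" previously failed the is_number check and fell through to a non-numeric comparison. A short sketch of the effect (this is_number is a stand-in for the helper used in evaluate_vqav2.py, not the commit's implementation):

    # Sketch of the percent-stripping effect; is_number is a stand-in helper.
    def is_number(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    pred, gt = "45%", "45"
    assert not is_number(pred)  # before: "45%" is not treated as a number

    pred, gt = pred.rstrip("%"), gt.rstrip("%")
    assert is_number(pred) and float(pred) == float(gt)  # after: numeric match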