Skip to content

Commit

Permalink
ADLR/megatron-lm!2306 - NVLM example scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
trintamaki committed Nov 16, 2024
1 parent 63b8520 commit 4131b07
Show file tree
Hide file tree
Showing 17 changed files with 1,395 additions and 28 deletions.
2 changes: 1 addition & 1 deletion examples/multimodal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ python examples/multimodal/model_converter/clip_converter.py --download-root /so
Update the paths to point to the mcore converted CLIP and Mistral models and run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder:

```
examples/multimodal/combine_mistral_clip.sh /path/to/mistral/model /path/to/clip/model /output/dir
examples/multimodal/combine_lm_vision_checkpoints.sh /path/to/mistral/model /path/to/clip/model /output/dir
```

## Training
Expand Down
57 changes: 57 additions & 0 deletions examples/multimodal/combine_lm_vision_checkpoints.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Combine per-rank language-model and vision-model checkpoints into a single
# multimodal checkpoint folder by calling combine_state_dicts.py.
#
# Usage:
#   combine_lm_vision_checkpoints.sh <mcore_lm_dir> <mcore_vision_dir> <output_dir> [model_type]
#
# model_type "nvlm" combines 8 tensor-parallel ranks; any other value
# (including omitted) combines 4 ranks, as in the Mistral CLIP example.

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <mcore_lm_dir> <mcore_vision_dir> <output_dir> [model_type]" >&2
    exit 1
fi

MCORE_LM=$1     # <path_to_mcore_lm_model_folder>
MCORE_VISION=$2 # <path_to_mcore_vision_model_folder>
OUTPUT_DIR=$3   # <path_to_output_folder_for_combined_checkpoint>
MODEL_TYPE=$4   # Model type. Default: Mistral CLIP example.

if [[ $MODEL_TYPE == "nvlm" ]]; then
    # NVLM TP=8
    TP_SIZE=8
else
    # Mistral CLIP example TP=4.
    TP_SIZE=4
fi

# Build the argument lists rank by rank. Inputs are interleaved as
# (language model, vision model) per rank, and the prefixes list must follow
# the same interleaving so each input gets its matching prefix.
INPUTS=()
PREFIXES=()
OUTPUTS=()
for ((rank = 0; rank < TP_SIZE; rank++)); do
    rank_dir=$(printf 'iter_0000001/mp_rank_%02d' "$rank")
    INPUTS+=(
        "${MCORE_LM}/${rank_dir}/model_optim_rng.pt"
        "${MCORE_VISION}/${rank_dir}/model_optim_rng.pt"
    )
    PREFIXES+=(language_model vision_model)
    OUTPUTS+=("${OUTPUT_DIR}/${rank_dir}/model_optim_rng.pt")
done

# Only record the iteration marker when the combine step succeeded, so a
# failed run does not leave behind a checkpoint that looks loadable.
if ! python examples/multimodal/combine_state_dicts.py \
    --input "${INPUTS[@]}" \
    --prefixes "${PREFIXES[@]}" \
    --output "${OUTPUTS[@]}"; then
    echo "combine_state_dicts.py failed; not writing latest_checkpointed_iteration.txt" >&2
    exit 1
fi

echo 1 > "${OUTPUT_DIR}/latest_checkpointed_iteration.txt"
23 changes: 0 additions & 23 deletions examples/multimodal/combine_mistral_clip.sh

This file was deleted.

19 changes: 18 additions & 1 deletion examples/multimodal/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,20 @@ def get_language_model_config(config):
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 20480
elif config.language_model_type == "qwen2.0_72B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 29568
else:
raise ValueError(f"unknown language model type {config.language_model_type}")

Expand Down Expand Up @@ -146,7 +160,6 @@ def get_vision_model_config(config, apply_query_key_layer_scaling):
else:
raise ValueError(f"unknown vision model type {config.vision_model_type}")


return config


Expand All @@ -171,6 +184,10 @@ def get_vision_projection_config(config, hidden_size):
config.ffn_hidden_size = 20480
config.normalization = 'LayerNorm'
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.0_72B":
config.ffn_hidden_size = 29568
config.normalization = 'LayerNorm'
config.activation_func = torch.nn.functional.gelu
else:
raise ValueError(f"unknown language model type {config.language_model_type}")

Expand Down
Empty file modified examples/multimodal/model_converter/internvit_converter.py
100644 → 100755
Empty file.
6 changes: 3 additions & 3 deletions examples/multimodal/model_converter/siglip_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
head_dim = 72
num_head = 16
for layer_idx in range(27):
origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
target_base = f"decoder.layers.{layer_idx}"

for param_type in ["weight", "bias"]:
# QKV
q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
Expand Down Expand Up @@ -135,7 +135,7 @@ def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
Example usage:
python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
examples/multimodal/combine_mistral_clip.sh /lustre/fsw/portfolios/llmservice/users/jbarker/workspace/checkpoints/Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
Expand Down
5 changes: 5 additions & 0 deletions examples/multimodal/nvlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
NVLM
====

Work in progress.
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
165 changes: 165 additions & 0 deletions examples/multimodal/nvlm/nvlm_prompts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable document along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
Loading

0 comments on commit 4131b07

Please sign in to comment.