From 1e8e79421019f079cdc15dd318d0c2ac2fb6335e Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 30 Aug 2024 13:58:43 +0000
Subject: [PATCH] videollava

---
 src/transformers/models/llama/modeling_llama.py             | 1 -
 src/transformers/models/video_llava/modeling_video_llava.py | 4 ++++
 .../test_modeling_tf_roberta_prelayernorm.py                | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 0bc44f314b5e86..59df8df46cd9dd 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -1154,7 +1154,6 @@ def forward(
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         num_logits_to_keep (`int`, *optional*):
             Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index 9ae80be65ae4b6..8ab0c6f9487ddf 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -426,6 +426,10 @@ def forward(
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        num_logits_to_keep (`int`, *optional*):
+            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
 
         num_logits_to_keep (`int`, *optional*):
             Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
diff --git a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
index 2138541603dd5a..ecf1f321af9588 100644
--- a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
+++ b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py
@@ -564,6 +564,7 @@ class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, PipelineTesterMixin, un
         if is_tf_available()
         else ()
     )
+    all_generative_model_classes = ()  # TFRobertaPreLayerNormForCausalLM fails numerical tests
     pipeline_model_mapping = (
        {
            "feature-extraction": TFRobertaPreLayerNormModel,
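
For reference, the `num_logits_to_keep` argument documented in the hunks above trims the logits computation to the trailing positions of the sequence. Below is a minimal sketch of how a caller would use it, assuming a transformers version in which `LlamaForCausalLM.forward` already accepts `num_logits_to_keep`; the checkpoint name is only a placeholder, not part of this patch.

from transformers import AutoTokenizer, LlamaForCausalLM

# Placeholder checkpoint; any Llama-style causal LM whose forward() exposes
# num_logits_to_keep would work the same way.
checkpoint = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = LlamaForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("The quick brown fox", return_tensors="pt")

# Default (num_logits_to_keep=0): logits for every position,
# shape (batch_size, sequence_length, vocab_size).
full = model(**inputs)

# Keep only the last position: shape (batch_size, 1, vocab_size).
# That is all greedy or sampling decoding needs, and it avoids materializing
# the full sequence_length-by-vocab_size logits tensor for long prompts.
last_only = model(**inputs, num_logits_to_keep=1)
next_token_id = last_only.logits[:, -1, :].argmax(dim=-1)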