Remove @slow for test_eager_matches_sdpa_inference #34558

Merged: 11 commits, Nov 5, 2024
Changes from all commits
@@ -95,7 +95,7 @@ def __init__(self, config):
mode = config.spatial_pool_mode
stride = config.spatial_pool_stride
out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size)
- self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2
+ self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2

if mode == "average":
self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
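
For clarity, a small worked example of why the parentheses matter in the change above (illustrative numbers, not the actual config values): `**` binds tighter than `//`, so the old expression divides by `patch_size**2`, while the fixed expression squares the number of patches per side.

```python
# Illustrative values only; not taken from any real vision config.
image_size, patch_size = 336, 14

old = image_size // patch_size**2         # 336 // 196 == 1   (divides by patch_size squared)
new = (image_size // patch_size) ** 2     # (336 // 14) ** 2 == 576   (patches per side, squared)

print(old, new)  # 1 576
```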
@@ -191,7 +191,7 @@ def __init__(self, config):
mode = config.spatial_pool_mode
stride = config.spatial_pool_stride
out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size)
- self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2
+ self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2

if mode == "average":
self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
2 changes: 1 addition & 1 deletion src/transformers/models/unispeech/modeling_unispeech.py
@@ -1577,7 +1577,7 @@ def forward(
quantized_features, codevector_perplexity = self.quantizer(extract_features)

# project quantized features twice
- quantized_features = self.project_q(quantized_features)
+ quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype))
quantized_features = self.project_hid(quantized_features)

prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_(
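
For context, a minimal sketch of the dtype issue the added cast guards against (an assumed scenario, not the PR's code): under half precision the projection weights are float16 while the quantizer output may still be float32, and `nn.Linear` rejects mismatched dtypes unless the input is cast to the weight dtype first.

```python
import torch
from torch import nn

# Hypothetical minimal reproduction of the mismatch.
project_q = nn.Linear(8, 8).half()            # layer weights in float16
quantized_features = torch.randn(2, 4, 8)     # still float32, as a quantizer might return

# project_q(quantized_features) would raise a dtype-mismatch RuntimeError here;
# casting to the weight dtype, as the diff does, makes the projection work.
out = project_q(quantized_features.to(project_q.weight.dtype))
print(out.dtype)  # torch.float16
```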
20 changes: 10 additions & 10 deletions tests/generation/test_utils.py
@@ -185,16 +185,16 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None):
# This is a band-aid for VLM models, to ensure they don't generate image/video tokens which would cause them
# to crash. On pretrained models this isn't a risk, as they are trained to not generate these tokens.
if config is not None:
- image_token_index = (
-     config.image_token_index
-     if getattr(config, "image_token_index", None) is not None
-     else getattr(config, "image_token_id", None)
- )
- video_token_index = config.video_token_index if hasattr(config, "video_token_index") else None
- if image_token_index is not None and image_token_index < config.get_text_config().vocab_size:
-     logits_processor_kwargs["bad_words_ids"].append([image_token_index])
- if video_token_index is not None and video_token_index < config.get_text_config().vocab_size:
-     logits_processor_kwargs["bad_words_ids"].append([video_token_index])
+ for key in [
+     "image_token_index",
+     "image_token_id",
+     "video_token_index",
+     "video_token_id",
+     "vision_start_token_id",
+ ]:
+     token_index = getattr(config, key, None)
+     if token_index is not None and token_index < config.get_text_config().vocab_size:
+         logits_processor_kwargs["bad_words_ids"].append([token_index])

return logits_processor_kwargs
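
As a reference for how these entries behave, a hedged sketch of `bad_words_ids` in `generate()` (the checkpoint name and token ids below are illustrative only):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative tiny checkpoint; any causal LM behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tokenizer("Hello", return_tensors="pt")

# Each inner list is a token-id sequence that generation must never emit,
# mirroring how the test bans image/video placeholder tokens.
bad_words_ids = [[42], [7]]  # hypothetical placeholder token ids

out = model.generate(**inputs, max_new_tokens=5, bad_words_ids=bad_words_ids)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```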

9 changes: 8 additions & 1 deletion tests/models/albert/test_modeling_albert.py
@@ -17,10 +17,11 @@
import unittest

from packaging import version
from parameterized import parameterized

from transformers import AlbertConfig, AutoTokenizer, is_torch_available
from transformers.models.auto import get_values
- from transformers.testing_utils import require_torch, slow, torch_device
+ from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -288,6 +289,12 @@ def setUp(self):
self.model_tester = AlbertModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@unittest.skip("Albert requires `head_mask` which is currently not done in this test.")
def test_eager_matches_sdpa_inference(self):
pass
Comment on lines +292 to +296
Member:

On skips like this, on Albert and other models: the test pulls the main input and the attention mask, manipulates them, and finally sends them to the model. We could pop these items from inputs_dict and then pass **inputs_dict to the model (e.g. model_eager(**prepared_inputs, **inputs_dict)) -- I think we then wouldn't need to skip tests due to missing inputs 🤗

Collaborator Author:

Maybe let's merge and do this in a follow-up PR. It's never worked before.
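
A rough, self-contained sketch of the suggestion above, using a toy module and hypothetical names (not part of this PR): pop only the inputs the test manipulates and forward everything else untouched, so models needing extra inputs such as `head_mask` would not require a skip.

```python
import torch
from torch import nn


class ToyModel(nn.Module):
    """Stand-in for a model that needs an extra input (e.g. an Albert-style `head_mask`)."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)

    def forward(self, inputs_embeds, attention_mask=None, head_mask=None):
        hidden = self.proj(inputs_embeds)
        if attention_mask is not None:
            hidden = hidden * attention_mask[..., None]
        if head_mask is not None:
            hidden = hidden * head_mask
        return hidden


model_eager = ToyModel()
inputs_dict = {
    "inputs_embeds": torch.randn(2, 3, 4),
    "attention_mask": torch.ones(2, 3),
    "head_mask": torch.ones(1),  # extra input the common test never forwards today
}

# Pop only what the test manipulates; keep the rest in inputs_dict...
prepared_inputs = {
    "inputs_embeds": inputs_dict.pop("inputs_embeds"),
    "attention_mask": inputs_dict.pop("attention_mask"),
}
# ...and pass the leftovers straight through, as suggested.
outputs = model_eager(**prepared_inputs, **inputs_dict)
print(outputs.shape)  # torch.Size([2, 3, 4])
```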


def test_config(self):
self.config_tester.run_common_tests()

302 changes: 0 additions & 302 deletions tests/models/glm/test_modeling_glm.py

Large diffs are not rendered by default.

10 changes: 0 additions & 10 deletions tests/models/granite/test_modeling_granite.py
@@ -25,7 +25,6 @@
require_read_token,
require_torch,
require_torch_gpu,
require_torch_sdpa,
slow,
torch_device,
)
@@ -445,15 +444,6 @@ def test_use_flash_attention_2_true(self):
if not has_flash:
raise ValueError("The flash model should have flash attention layers")

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
"""
skipping the test since mup is very flaky and gets consistently different outputs
"""
self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs")


@require_torch_gpu
class GraniteIntegrationTest(unittest.TestCase):
10 changes: 0 additions & 10 deletions tests/models/granitemoe/test_modeling_granitemoe.py
@@ -25,7 +25,6 @@
require_read_token,
require_torch,
require_torch_gpu,
require_torch_sdpa,
slow,
torch_device,
)
@@ -444,15 +443,6 @@ def test_use_flash_attention_2_true(self):
if not has_flash:
raise ValueError("The flash model should have flash attention layers")

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
"""
skipping the test since mup is very flaky and gets consistently different outputs
"""
self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs")


@require_torch_gpu
class GraniteMoeIntegrationTest(unittest.TestCase):
15 changes: 13 additions & 2 deletions tests/models/idefics/test_modeling_idefics.py
@@ -134,7 +134,7 @@ def __init__(
num_attention_heads=self.vision_num_attention_heads,
num_hidden_layers=self.vision_num_hidden_layers,
intermediate_size=self.vision_intermediate_size,
- )
+ ).to_dict()

self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver
self.perceiver_resampler_depth = perceiver_resampler_depth
@@ -316,7 +316,6 @@ def prepare_pixel_values(self):
return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

@require_torch_sdpa
@slow
Member:

I think this test doesn't need to be skipped anymore, since we no longer check whether the model has SDPA layers within this test. But it can still be skipped due to the same flakiness.

Collaborator Author:

It still has input preparation issues; I added the skip reason below to another model test class:

"Idefics requires both text and image inputs which is currently not done in this test."

As mentioned in a reply to Joao's comment, let's try to do this in a follow-up PR.

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test")
@@ -353,6 +352,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):

return inputs_dict

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.")
def test_eager_matches_sdpa_inference(self):
pass

def test_model_outputs_equivalence(self):
try:
orig = self.all_model_classes
@@ -602,6 +607,12 @@ def setUp(self):
)
self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37)

@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.")
def test_eager_matches_sdpa_inference(self, torch_dtype):
pass

@pytest.mark.generate
def test_left_padding_compatibility(self):
"""Overwrite because IDEFICS needs image attention mask to be also padded"""
8 changes: 4 additions & 4 deletions tests/models/llava_next/test_modeling_llava_next.py
@@ -90,7 +90,7 @@ def __init__(
},
is_training=True,
vision_config={
"image_size": 16,
"image_size": 8,
"patch_size": 4,
"num_channels": 3,
"is_training": True,
@@ -123,10 +123,10 @@ def __init__(
self.batch_size = 3
self.num_channels = 3
self.image_size = 30
- self.encoder_seq_length = 95
- self.image_grid_pinpoints = [[32, 32]]
- self.num_image_tokens = 88
+ self.image_grid_pinpoints = [[16, 16]]
+ self.num_image_tokens = 24
self.seq_length = seq_length + self.num_image_tokens
+ self.encoder_seq_length = self.seq_length

def get_config(self):
return LlavaNextConfig(
10 changes: 5 additions & 5 deletions tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -91,7 +91,7 @@ def __init__(
},
is_training=True,
vision_config={
"image_size": 16,
"image_size": 8,
"patch_size": 4,
"num_channels": 3,
"is_training": True,
@@ -125,10 +125,10 @@ def __init__(
self.batch_size = 3
self.num_channels = 3
self.image_size = 30
- self.encoder_seq_length = 127
- self.image_grid_pinpoints = [[32, 32]]
- self.num_image_tokens = 88
- self.num_video_tokens = 32

+ self.image_grid_pinpoints = [[16, 16]]
+ self.num_image_tokens = 24
+ self.num_video_tokens = 8
self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens

def get_config(self):
84 changes: 34 additions & 50 deletions tests/models/mimi/test_modeling_mimi.py
@@ -409,10 +409,14 @@ def test_identity_shortcut(self):
config.use_conv_shortcut = False
self.model_tester.create_and_check_model_forward(config, inputs_dict)

# Overwrite to use `audio_values` as the tensors to compare.
# TODO: Try to do this in the parent class.
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
if torch_dtype == "float16" and torch_device == "cpu":
self.skipTest("`replication_pad1d` not implemented for 'Half")

if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")

Expand Down Expand Up @@ -513,7 +517,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters
if not (self.has_attentions and can_output_attn) and output_attentions:
continue
- for batch_size in [1, 5]:
+ for batch_size in [7]:
dummy_input = inputs_dict[model.main_input_name]

if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
@@ -564,11 +568,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):

dummy_attention_mask[:] = 1
if padding_side == "left":
- dummy_attention_mask[-1, :-1] = 1
- dummy_attention_mask[-1, -4:] = 0
+ dummy_attention_mask[-1, :2] = 0
+ dummy_attention_mask[-1, 2:] = 1
elif padding_side == "right":
- dummy_attention_mask[-1, 1:] = 1
- dummy_attention_mask[-1, :3] = 0
+ dummy_attention_mask[-1, -2:] = 0
+ dummy_attention_mask[-1, :-2] = 1

for enable_kernels in [False, True]:
failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
@@ -655,52 +659,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):

# Masked tokens output slightly deviates - we don't mind that.
if use_mask:
_logits_sdpa = torch.zeros_like(input=logits_sdpa)
_logits_eager = torch.zeros_like(input=logits_eager)

_logits_sdpa[:-1] = logits_sdpa[:-1]
_logits_eager[:-1] = logits_eager[:-1]

if padding_side == "left":
sub_sdpa = logits_sdpa[:-1]
sub_eager = logits_eager[:-1]
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
fail_cases.append(
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
)

sub_sdpa = logits_sdpa[-1, :-4]
sub_eager = logits_eager[-1, :-4]
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
fail_cases.append(
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
)

# Testing the padding tokens is not really meaningful but anyway
# sub_sdpa = logits_sdpa[-1, -4:]
# sub_eager = logits_eager[-1, -4:]
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
elif padding_side == "right":
sub_sdpa = logits_sdpa[:-1]
sub_eager = logits_eager[:-1]
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
fail_cases.append(
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
)

sub_sdpa = logits_sdpa[-1, 3:]
sub_eager = logits_eager[-1, 3:]
if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
fail_cases.append(
get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
)

# Testing the padding tokens is not really meaningful but anyway
# sub_sdpa = logits_sdpa[-1, :3]
# sub_eager = logits_eager[-1, :3]
# if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
# fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:]
_logits_eager[-1:, 2:] = logits_eager[-1:, 2:]

else:
if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
fail_cases.append(
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
)
elif padding_side == "right":
_logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2]
_logits_eager[-1:, 2:] = logits_eager[-1:, :-2]

logits_sdpa = _logits_sdpa
logits_eager = _logits_eager

results = [
torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
]
# If 80% batch elements have matched results, it's fine
if np.mean(results) < 0.8:
fail_cases.append(
get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
)

self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
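
To make the acceptance rule above concrete, a standalone sketch (toy tensors, not the test's real logits): logits are compared element-by-element along the batch dimension, and a failure is only recorded when fewer than 80% of the batch elements match within tolerance.

```python
import numpy as np
import torch

atol = rtol = 1e-3

# Toy logits: 5 batch elements, exactly one of which deviates.
logits_eager = torch.zeros(5, 4)
logits_sdpa = logits_eager.clone()
logits_sdpa[0] += 1.0

results = [
    torch.allclose(_sdpa, _eager, atol=atol, rtol=rtol)
    for _sdpa, _eager in zip(logits_sdpa, logits_eager)
]
print(np.mean(results))        # 0.8 -> 80% of elements match
print(np.mean(results) < 0.8)  # False, so this case would not be reported as a failure
```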

9 changes: 0 additions & 9 deletions tests/models/mllama/test_modeling_mllama.py
@@ -30,12 +30,10 @@
from transformers.models.mllama.configuration_mllama import MllamaTextConfig
from transformers.testing_utils import (
cleanup,
is_flaky,
require_bitsandbytes,
require_read_token,
require_torch,
require_torch_gpu,
require_torch_sdpa,
slow,
torch_device,
)
@@ -354,13 +352,6 @@ def _check_attentions_for_generate(

self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes)

@require_torch_sdpa
@slow
@is_flaky()
def test_eager_matches_sdpa_inference_1_bfloat16(self):
# A workaround to override parametrized test with flaky decorator
super().test_eager_matches_sdpa_inference_1_bfloat16()

@unittest.skip("For some unknown reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ")
def test_sdpa_can_compile_dynamic(self):
pass