diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 2025140bb6e36a..0faaa2b199542d 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -192,7 +192,7 @@ def __init__(self, config): mode = config.spatial_pool_mode stride = config.spatial_pool_stride out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size) - self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2 + self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2 if mode == "average": self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)