huggingface · dacorvo · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025
@@ -22,14 +22,14 @@ jobs:
     uses: ./.github/workflows/python-quality.yml
   test-ubuntu-cuda:
     needs: [check-commits, python-quality]
-    runs-on: 
+    runs-on:
       group: aws-g5-4xlarge-plus
     strategy:
       fail-fast: false
       matrix:
-        cuda-version: ["11.8", "12.1", "12.4"]
+        cuda-version: ["11.8", "12.4", "12.6"]
     container:
-      image: pytorch/pytorch:2.4.0-cuda${{ matrix.cuda-version }}-cudnn9-devel
+      image: pytorch/pytorch:2.6.0-cuda${{ matrix.cuda-version }}-cudnn9-devel
       options: --gpus 0
 
     steps:

@@ -22,14 +22,14 @@ jobs:
     uses: ./.github/workflows/python-quality.yml
   run-examples:
     needs: [check-commits, python-quality]
-    runs-on: 
+    runs-on:
       group: aws-g5-4xlarge-plus
     strategy:
       fail-fast: false
       matrix:
         device: ["cpu", "cuda"]
     container:
-      image: pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
+      image: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
       options: --gpus 0
 
     steps:

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.2.6dev"
+__version__ = "0.2.7dev"
 
 from .calibrate import *
 from .library import *

@@ -5,3 +5,5 @@ To add a new implementation for an operation defined in `library./ops.py`:
 - add the corresponding `.mm` file to the list of sources in `__init__.py`,
 - add a binding to `pybind_module.cpp`,
 - provide an implementation calling the binding in `__init__.py`.
+
+Note: torch JIT extensions for MPS require the Xcode command-line tools.
@@ -16,7 +16,6 @@
 from copy import copy
 
 import torch
-from packaging import version
 from torch.utils import _pytree as pytree
 
 
@@ -27,7 +26,6 @@ class TinyGemmPackedTensor(torch.Tensor):
     @staticmethod
     def __new__(cls, data, size, stride, requires_grad=False):
         # TinyGemmPackedTensor represents uint8 data and can therefore NEVER require gradient
-        assert data.dtype == torch.int32
         assert requires_grad is False
         return torch.Tensor._make_wrapper_subclass(
             cls, size, strides=stride, dtype=torch.uint8, device=data.device, requires_grad=requires_grad
@@ -54,11 +52,11 @@ def pack(cls, t):
         """
         inner_ktiles = 2
         t = t.to(torch.int32).contiguous()
-        if version.parse(torch.__version__).release >= version.parse("2.5.0").release:
+        if t.device.type == "cpu":
+            data = torch._convert_weight_to_int4pack_for_cpu(t, innerKTiles=inner_ktiles)
+        else:
             t_uint8 = (t[::, ::2] << 4 | t[::, 1::2]).to(torch.uint8)
             data = torch._convert_weight_to_int4pack(t_uint8, innerKTiles=inner_ktiles)
-        else:
-            data = torch._convert_weight_to_int4pack(t, innerKTiles=inner_ktiles)
         # We need to store size and stride to make sure the unpacked data has the correct shape
         return TinyGemmPackedTensor(data, t.size(), t.stride())
 
@@ -86,7 +84,10 @@ def unpack(self):
         id_scale_and_shift[:, :, 1] = 8
 
         identity = torch.eye(in_features, dtype=torch.bfloat16, device=self.device)
-        unpacked_data = torch._weight_int4pack_mm(identity, self._data, group_size, id_scale_and_shift)
+        if self._data.device.type == "cpu":
+            unpacked_data = torch._weight_int4pack_mm_for_cpu(identity, self._data, group_size, id_scale_and_shift)
+        else:
+            unpacked_data = torch._weight_int4pack_mm(identity, self._data, group_size, id_scale_and_shift)
 
         return unpacked_data.t().to(torch.uint8)
 

@@ -48,9 +48,14 @@ def forward(ctx, input, other, bias):
         in_features = input.shape[-1]
         out_features = other.shape[0]
         output_shape = input.shape[:-1] + (out_features,)
-        output = torch._weight_int4pack_mm(
-            input.reshape(-1, in_features), other._data._data, other._group_size, other._scale_shift
-        )
+        if input.device.type == "cpu":
+            output = torch._weight_int4pack_mm_for_cpu(
+                input.reshape(-1, in_features), other._data._data, other._group_size, other._scale_shift
+            )
+        else:
+            output = torch._weight_int4pack_mm(
+                input.reshape(-1, in_features), other._data._data, other._group_size, other._scale_shift
+            )
         output = output.reshape(output_shape)
         if bias is not None:
             output = output + bias

@@ -19,7 +19,7 @@ authors = [{ name = 'David Corvoysier' }]
 maintainers = [
     {name = "HuggingFace Inc. Special Ops Team", email="[email protected]"},
 ]
-dependencies = ['torch>=2.4.0', 'ninja', 'numpy', 'safetensors', 'huggingface_hub']
+dependencies = ['torch>=2.6.0', 'ninja', 'numpy', 'safetensors', 'huggingface_hub']
 license = { text = 'Apache-2.0' }
 readme = 'README.md'
 dynamic = ['version']
@@ -66,4 +66,4 @@ lint.select = ['C', 'E', 'F', 'I', 'W']
 # isort configuration (to sort imports)
 [tool.ruff.lint.isort]
 lines-after-imports = 2
-known-first-party = ['optimum.quanto']
+known-first-party = ['optimum.quanto']
@@ -27,7 +27,7 @@ def compile_for_device(f, device):
     return torch.compile(f, backend=backend)
 
 
-@torch_min_version("2.6.0")
+@torch_min_version("2.7.0")
 @pytest.mark.parametrize("input_shape", [(2, 10), (10, 32, 32)])
 @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"])
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"])
@@ -48,7 +48,6 @@ def f(x, qtype):
     assert qa.axis is None
 
 
-@torch_min_version("2.3.0")
 def test_compile_qtensor_to(device):
     input_shape = (10, 32, 32)
     a = random_tensor(input_shape).to(device)

@@ -44,7 +44,7 @@ def test_packed_tensor_serialization(bits, device):
     b = io.BytesIO()
     torch.save(packed, b)
     b.seek(0)
-    packed_reloaded = torch.load(b)
+    packed_reloaded = torch.load(b, weights_only=False)
     assert isinstance(packed_reloaded, PackedTensor)
     assert packed_reloaded.shape == packed.shape
     assert packed_reloaded.dtype == packed.dtype

@@ -28,7 +28,7 @@ def test_weight_qbits_tensor_serialization(qtype, axis):
     b = io.BytesIO()
     torch.save(qa, b)
     b.seek(0)
-    qa_reloaded = torch.load(b)
+    qa_reloaded = torch.load(b, weights_only=False)
     assert isinstance(qa_reloaded, WeightQBitsTensor)
     assert qa_reloaded.qtype == qa.qtype
     assert qa_reloaded.dtype == qa.dtype

@@ -30,7 +30,7 @@ def test_weights_qbytes_tensor_serialization(input_shape, qtype, dtype, axis):
     b = io.BytesIO()
     torch.save(qinputs, b)
     b.seek(0)
-    qinputs_reloaded = torch.load(b)
+    qinputs_reloaded = torch.load(b, weights_only=False)
     assert qinputs_reloaded.qtype == qtype
     assert torch.equal(qinputs_reloaded._scale, qinputs._scale)
     if qtype.is_floating_point: