Feature/support older intel mac book pro with gcc 13 #1085

Draft: wants to merge 16 commits into base: main
2 changes: 1 addition & 1 deletion docker/README.md
@@ -24,7 +24,7 @@ docker compose build
OR
```bash
docker pull intel/ai-tools:itrex-1.3.0
docker pull intel/ai-tools:itrex-devel-1.3.0
docker pull intel/ai-tools:itrex-1.3.0-devel
Collaborator comment:
Thanks for the correction. I'm not familiar with docker but it seems that you are right. To be confirmed by @tylertitsworth?

```

## Use Docker Image
@@ -601,7 +601,7 @@ def forward(self, sequence_output, pooled_output):

class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
@@ -663,7 +663,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
@@ -366,7 +366,7 @@ inline float get_mxfp_maxnorm(const JBLAS_DTYPE t, int ebits, int mantissa_bits)
return max_norm;
}

#ifndef _WIN32
#if !defined(_WIN32) && !defined(__APPLE__)
static void request_perm_xtile_data() {
unsigned long bitmask;
long rc;
@@ -1228,7 +1228,7 @@ class PaddingTransInterleaveCvt : protected xbyak::JitAvx512f {
jmp(ptr[reg_tmp + reg_tmp2 * sizeof(void*)], T_NEAR); // switch(rows-iterrow) ...
align(sizeof(intptr_t));
L(l_tail_tbl);
db(reinterpret_cast<uintptr_t>(nullptr), sizeof(intptr_t)); // case 0 should never occur
db(reinterpret_cast<uint64_t>(nullptr), sizeof(intptr_t)); // case 0 should never occur
for (int i = 1; i < trans_cell; ++i) putL(l_tail_case[i]);

for (int m_tail = 1; m_tail < trans_cell; ++m_tail) { // case (m_tail):
@@ -65,7 +65,7 @@ def __import_package(self, model_type):
elif model_type == "mistral":
import intel_extension_for_transformers.llm.runtime.graph.mistral_cpp as cpp_model
else:
raise TypeError("Unspported model type {}!".format(model_type))
raise TypeError("Unsupported model type {}!".format(model_type))
self.module = cpp_model

@staticmethod
@@ -134,7 +134,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
import platform
sys_platform = platform.platform().lower()
if threads is None:
if "windows" in sys_platform:
if "windows" in sys_platform or "macos" in sys_platform:
cpu_count = os.cpu_count()
generate_kwargs["threads"] = int(cpu_count)
else:
@@ -212,7 +212,7 @@ def eos_token_id(self):
if self.model_type == 'qwen':
return self.tokenizer.special_tokens['<|endoftext|>']
return self.tokenizer.eos_token_id

def pad_token_id(self):
if self.tokenizer.pad_token_id == None:
if self.batch_size == 1:
@@ -31,7 +31,7 @@ else ()
target_link_libraries(ne_layers PUBLIC Threads::Threads jblas::jblas ne_vec)
endif()

if(NOT WIN32)
if(NOT WIN32 AND NOT APPLE)
target_link_libraries(ne_layers PUBLIC rt)
endif()

@@ -54,7 +54,7 @@ function(add_test_target src)
target_link_options(${test_target} PRIVATE -fsanitize=address)
target_include_directories(${test_target} PUBLIC .)
target_link_libraries(${test_target} PUBLIC Threads::Threads jblas::jblas ne_vec)
if(NOT WIN32)
if(NOT WIN32 AND NOT APPLE)
target_link_libraries(${test_target} PUBLIC rt)
endif()
add_test(NAME ${test_target} COMMAND ${test_target})
@@ -79,8 +79,8 @@ graph LR;

We need to implement corresponding serialization methods from pytorch format, which is mainly divided into the following three steps.

## 1.1. Hyperparamters
The term **"hyperparamters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weight that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models:
## 1.1. Hyperparameters
The term **"hyperparameters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weight that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models:
- n_vocab: the size of the model's vocabulary
- n_embd: the size of the model's " embedding layer", which is used during prompt ingestion.
- n_layer: the number of layers in the model; each layer represents a set of weights.
@@ -328,7 +328,7 @@ Most of our model examples only support single prompt processing. You need to ad
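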
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(${TARGET} PUBLIC ne_layers jblas::jblas)
```
and and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt).
and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt).
```diff
add_subdirectory(opt)
add_subdirectory(bloom)
@@ -33,11 +33,11 @@ int32_t get_num_physical_cores() {
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
int result = syscall(SYS_sysctlbyname, "hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
result = syscall(SYS_sysctlbyname, "hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
@@ -165,7 +165,7 @@ def guessed(model: 'LazyModel') -> 'Params':
n_mult=256,
n_head=n_embd // 128,
n_head_kv=n_embd // 128,
f_norm_eps=1e-5,
rms_norm_eps=1e-5,
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model),
)

@@ -203,7 +203,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
)

# LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8,
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8,
# "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
@@ -230,8 +230,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
n_head=n_head,
n_head_kv=n_head_kv,
ffn_hidden_size=ffn_hidden_size,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
)

@staticmethod
@@ -278,7 +278,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(self.params_vocab_size):
text: bytes
text: bytes
if i < tokenizer.vocab_size():
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
@@ -1086,7 +1086,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", params.bos_token_id))
self.fout.write(struct.pack("i", params.eos_token_id))
@@ -1108,10 +1108,9 @@ def write_vocab(self, vocab: Vocab) -> None:

@staticmethod
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=NEFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

@@ -164,7 +164,7 @@ def guessed(model: 'LazyModel') -> 'Params':
n_mult=256,
n_head=n_embd // 128,
n_head_kv=n_embd // 128,
f_norm_eps=1e-5,
rms_norm_eps=1e-5,
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model),
)

@@ -192,6 +192,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size=ffn_hidden_size,
rms_norm_eps=rms_norm_eps,
rope_theta=rope_theta,
rope_scale=rope_scale,
)

# LLaMA v2 70B params.json
@@ -1064,8 +1065,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(
struct.pack("i", 1)
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", 2))

@@ -1087,10 +1088,9 @@ def write_vocab(self, vocab: Vocab) -> None:

@staticmethod
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=NEFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
torch==2.1.0+cpu ; sys_platform != 'darwin'
torch==2.1.0 ; sys_platform == 'darwin'
transformers
numpy
sentencepiece
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,6 +4,7 @@ py-cpuinfo
setuptools>=65
setuptools_scm[toml]>=6.2
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
torch==2.1.0+cpu ; sys_platform != 'darwin'
torch==2.1.0 ; sys_platform == 'darwin'
accelerate
optimum-intel
21 changes: 12 additions & 9 deletions setup.py
@@ -71,7 +71,7 @@ class CMakeBuild(build_ext):

@staticmethod
def _is_target_file(file_name: str) -> bool:
if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd"):
if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd") or file_name.endswith(".dylib"):
return True
if file_name.endswith(".so") or ".so." in file_name:
return True
@@ -234,21 +234,24 @@ def check_submodules():
end = time.time()
print(f' --- Submodule initialization took {end - start:.2f} sec')
except Exception:
print(' --- Submodule initalization failed')
print(' --- Submodule initialization failed')
print('Please run:\n\tgit submodule update --init --recursive')
sys.exit(1)


if __name__ == '__main__':
ext_modules = [CMakeExtension(
"intel_extension_for_transformers.qbits", 'intel_extension_for_transformers/llm/operator/csrc', lib_only=True)]
ext_modules = []
if sys.platform != "darwin":
ext_modules.append(CMakeExtension("intel_extension_for_transformers.qbits",
"intel_extension_for_transformers/llm/operator/csrc", lib_only=True))
if not SKIP_RUNTIME:
check_submodules()
ext_modules.extend([
CMakeExtension("intel_extension_for_transformers.neural_engine_py", "intel_extension_for_transformers/llm/runtime/deprecated/"),
CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp", "intel_extension_for_transformers/llm/runtime/graph/"),
])
cmdclass={'build_ext': CMakeBuild}
ext_modules.append(CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp",
"intel_extension_for_transformers/llm/runtime/graph/"))
if sys.platform != "darwin":
ext_modules.append(CMakeExtension("intel_extension_for_transformers.neural_engine_py",
"intel_extension_for_transformers/llm/runtime/deprecated/"))
cmdclass = {'build_ext': CMakeBuild}

setup(
name="intel-extension-for-transformers",