Feature/support older intel mac book pro with gcc 13 #1085

Draft: wants to merge 16 commits into base: main
2 changes: 1 addition & 1 deletion docker/README.md
@@ -24,7 +24,7 @@ docker compose build
OR
```bash
docker pull intel/ai-tools:itrex-1.3.0
docker pull intel/ai-tools:itrex-devel-1.3.0
docker pull intel/ai-tools:itrex-1.3.0-devel
Collaborator comment:
Thanks for the correction. I'm not familiar with docker but it seems that you are right. To be confirmed by @tylertitsworth?

```

## Use Docker Image
@@ -601,7 +601,7 @@ def forward(self, sequence_output, pooled_output):

class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
@@ -663,7 +663,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
@@ -366,7 +366,7 @@ inline float get_mxfp_maxnorm(const JBLAS_DTYPE t, int ebits, int mantissa_bits)
return max_norm;
}

#ifndef _WIN32
#if !defined(_WIN32) && !defined(__APPLE__)
static void request_perm_xtile_data() {
unsigned long bitmask;
long rc;
@@ -1228,7 +1228,7 @@ class PaddingTransInterleaveCvt : protected xbyak::JitAvx512f {
jmp(ptr[reg_tmp + reg_tmp2 * sizeof(void*)], T_NEAR); // switch(rows-iterrow) ...
align(sizeof(intptr_t));
L(l_tail_tbl);
db(reinterpret_cast<uintptr_t>(nullptr), sizeof(intptr_t)); // case 0 should never occur
db(reinterpret_cast<uint64_t>(nullptr), sizeof(intptr_t)); // case 0 should never occur
for (int i = 1; i < trans_cell; ++i) putL(l_tail_case[i]);

for (int m_tail = 1; m_tail < trans_cell; ++m_tail) { // case (m_tail):
@@ -65,7 +65,7 @@ def __import_package(self, model_type):
elif model_type == "mistral":
import intel_extension_for_transformers.llm.runtime.graph.mistral_cpp as cpp_model
else:
raise TypeError("Unspported model type {}!".format(model_type))
raise TypeError("Unsupported model type {}!".format(model_type))
self.module = cpp_model

@staticmethod
@@ -134,7 +134,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
import platform
sys_platform = platform.platform().lower()
if threads is None:
if "windows" in sys_platform:
if "windows" in sys_platform or "macos" in sys_platform:
cpu_count = os.cpu_count()
generate_kwargs["threads"] = int(cpu_count)
else:
@@ -212,7 +212,7 @@ def eos_token_id(self):
if self.model_type == 'qwen':
return self.tokenizer.special_tokens['<|endoftext|>']
return self.tokenizer.eos_token_id

def pad_token_id(self):
if self.tokenizer.pad_token_id == None:
if self.batch_size == 1:
@@ -31,7 +31,7 @@ else ()
target_link_libraries(ne_layers PUBLIC Threads::Threads jblas::jblas ne_vec)
endif()

if(NOT WIN32)
if(NOT WIN32 AND NOT APPLE)
target_link_libraries(ne_layers PUBLIC rt)
endif()

@@ -54,7 +54,7 @@ function(add_test_target src)
target_link_options(${test_target} PRIVATE -fsanitize=address)
target_include_directories(${test_target} PUBLIC .)
target_link_libraries(${test_target} PUBLIC Threads::Threads jblas::jblas ne_vec)
if(NOT WIN32)
if(NOT WIN32 AND NOT APPLE)
target_link_libraries(${test_target} PUBLIC rt)
endif()
add_test(NAME ${test_target} COMMAND ${test_target})
@@ -79,8 +79,8 @@ graph LR;

We need to implement corresponding serialization methods from pytorch format, which is mainly divided into the following three steps.

## 1.1. Hyperparamters
The term **"hyperparamters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weight that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models:
## 1.1. Hyperparameters
The term **"hyperparameters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weight that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models:
- n_vocab: the size of the model's vocabulary
- n_embd: the size of the model's " embedding layer", which is used during prompt ingestion.
- n_layer: the number of layers in the model; each layer represents a set of weights.
@@ -328,7 +328,7 @@ Most of our model examples only support single prompt processing. You need to ad
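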
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(${TARGET} PUBLIC ne_layers jblas::jblas)
```
and and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt).
and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt).
```diff
add_subdirectory(opt)
add_subdirectory(bloom)
@@ -33,11 +33,11 @@ int32_t get_num_physical_cores() {
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
int result = syscall(SYS_sysctlbyname, "hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
result = syscall(SYS_sysctlbyname, "hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
@@ -165,7 +165,7 @@ def guessed(model: 'LazyModel') -> 'Params':
n_mult=256,
n_head=n_embd // 128,
n_head_kv=n_embd // 128,
f_norm_eps=1e-5,
rms_norm_eps=1e-5,
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model),
)

@@ -203,7 +203,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
)

# LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8,
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8,
# "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
@@ -230,8 +230,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
n_head=n_head,
n_head_kv=n_head_kv,
ffn_hidden_size=ffn_hidden_size,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
)

@staticmethod
@@ -278,7 +278,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(self.params_vocab_size):
text: bytes
text: bytes
if i < tokenizer.vocab_size():
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
@@ -1086,7 +1086,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", params.bos_token_id))
self.fout.write(struct.pack("i", params.eos_token_id))
@@ -1108,10 +1108,9 @@ def write_vocab(self, vocab: Vocab) -> None:

@staticmethod
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=NEFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

@@ -164,7 +164,7 @@ def guessed(model: 'LazyModel') -> 'Params':
n_mult=256,
n_head=n_embd // 128,
n_head_kv=n_embd // 128,
f_norm_eps=1e-5,
rms_norm_eps=1e-5,
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model),
)

@@ -192,6 +192,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size=ffn_hidden_size,
rms_norm_eps=rms_norm_eps,
rope_theta=rope_theta,
rope_scale=rope_scale,
)

# LLaMA v2 70B params.json
@@ -1064,8 +1065,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(
struct.pack("i", 1)
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", 2))

@@ -1087,10 +1088,9 @@ def write_vocab(self, vocab: Vocab) -> None:

@staticmethod
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=NEFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
torch==2.1.0+cpu ; sys_platform != 'darwin'
torch==2.1.0 ; sys_platform == 'darwin'
transformers
numpy
sentencepiece
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,6 +4,7 @@ py-cpuinfo
setuptools>=65
setuptools_scm[toml]>=6.2
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
torch==2.1.0+cpu ; sys_platform != 'darwin'
torch==2.1.0 ; sys_platform == 'darwin'
accelerate
optimum-intel
21 changes: 12 additions & 9 deletions setup.py
@@ -71,7 +71,7 @@ class CMakeBuild(build_ext):

@staticmethod
def _is_target_file(file_name: str) -> bool:
if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd"):
if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd") or file_name.endswith(".dylib"):
return True
if file_name.endswith(".so") or ".so." in file_name:
return True
@@ -234,21 +234,24 @@ def check_submodules():
end = time.time()
print(f' --- Submodule initialization took {end - start:.2f} sec')
except Exception:
print(' --- Submodule initalization failed')
print(' --- Submodule initialization failed')
print('Please run:\n\tgit submodule update --init --recursive')
sys.exit(1)


if __name__ == '__main__':
ext_modules = [CMakeExtension(
"intel_extension_for_transformers.qbits", 'intel_extension_for_transformers/llm/operator/csrc', lib_only=True)]
ext_modules = []
if sys.platform != "darwin":
ext_modules.append(CMakeExtension("intel_extension_for_transformers.qbits",
"intel_extension_for_transformers/llm/operator/csrc", lib_only=True))
if not SKIP_RUNTIME:
check_submodules()
ext_modules.extend([
CMakeExtension("intel_extension_for_transformers.neural_engine_py", "intel_extension_for_transformers/llm/runtime/deprecated/"),
CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp", "intel_extension_for_transformers/llm/runtime/graph/"),
])
cmdclass={'build_ext': CMakeBuild}
ext_modules.append(CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp",
"intel_extension_for_transformers/llm/runtime/graph/"))
if sys.platform != "darwin":
ext_modules.append(CMakeExtension("intel_extension_for_transformers.neural_engine_py",
"intel_extension_for_transformers/llm/runtime/deprecated/"))
cmdclass = {'build_ext': CMakeBuild}

setup(
name="intel-extension-for-transformers",