diff --git a/notebooks/hugging-face-hub/hugging-face-hub.ipynb b/notebooks/hugging-face-hub/hugging-face-hub.ipynb index 14604dfdf72..3a8bc2d1f9e 100644 --- a/notebooks/hugging-face-hub/hugging-face-hub.ipynb +++ b/notebooks/hugging-face-hub/hugging-face-hub.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -36,7 +35,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -48,7 +46,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -58,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -68,7 +65,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -92,7 +88,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -114,7 +109,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] @@ -131,7 +128,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -176,7 +172,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -203,7 +198,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -212,7 +206,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -227,12 +220,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "076e75b32a964983a4a6df36c1c3d1e0", + "model_id": "d844663d421c4ea9a448d5d44be7f961", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO')" + "Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')" ] }, "execution_count": 6, @@ -256,7 +249,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -290,7 +282,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -305,7 +296,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -318,7 +308,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -345,8 +334,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n", + "\u001b[33mDEPRECATION: torchsde 0.2.5 has a non-standard dependency specifier numpy>=1.19.*; python_version >= \"3.7\". pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of torchsde or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] @@ -357,7 +347,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -375,13 +364,6 @@ "execution_count": 9, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" - ] - }, { "name": "stderr", "output_type": "stream", @@ -390,11 +372,31 @@ "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "2024-07-17 09:40:17.150496: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-07-17 09:40:17.152256: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-07-17 09:40:17.187913: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-07-17 09:40:17.188455: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-07-17 09:40:17.937510: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for:\n", + " PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.3.0+cpu)\n", + " Python 3.8.18 (you have 3.8.10)\n", + " Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)\n", + " Memory-efficient attention, SwiGLU, sparse and more won't be available.\n", + " Set XFORMERS_MORE_DETAILS=1 for more details\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n" + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n" ] } ], @@ -403,7 +405,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -422,15 +423,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "Framework not specified. Using pt to export to ONNX.\n", - "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", + "Framework not specified. Using pt to export the model.\n", + "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-    "Using the export variant default. Available variants are:\n",
-    "    - default: The default ONNX variant.\n",
-    "Using framework PyTorch: 2.1.0+cpu\n",
+    "Using framework PyTorch: 2.3.0+cpu\n",
     "Overriding 1 configuration item(s)\n",
-    "\t- use_cache -> False\n",
+    "\t- use_cache -> False\n"
+   ]
+  },
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11.\n"
+   ]
+  },
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "[ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s.\n",
     "Compiling the model to AUTO ...\n"
    ]
   }
@@ -443,7 +456,6 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -461,7 +473,7 @@
    "\n",
    "You can find a mapping between tasks and model classes in Optimum TaskManager [documentation](https://huggingface.co/docs/optimum/exporters/task_manager).\n",
    "\n",
-    "Additionally, you can specify weights compression `--fp16` for the compression model to FP16 and `--int8` for the compression model to INT8. Please note, that for INT8, it is necessary to install nncf.\n",
+    "Additionally, you can specify weights compression using the `--weight-format` argument with one of the following options: `fp32`, `fp16`, `int8` and `int4`. For `int8` and `int4`, NNCF will be used for weight compression.\n",
    "\n",
    "Full list of supported arguments available via `--help`"
   ]
  },
@@ -485,11 +497,21 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
+    "2024-07-17 09:40:40.173915: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
     "usage: optimum-cli export openvino [-h] -m MODEL [--task TASK]\n",
-    "                                   [--cache_dir CACHE_DIR]\n",
     "                                   [--framework {pt,tf}] [--trust-remote-code]\n",
-    "                                   [--pad-token-id PAD_TOKEN_ID] [--fp16]\n",
-    "                                   [--int8]\n",
+    "                                   [--weight-format {fp32,fp16,int8,int4,int4_sym_g128,int4_asym_g128,int4_sym_g64,int4_asym_g64}]\n",
+    "                                   [--library {transformers,diffusers,timm,sentence_transformers}]\n",
+    "                                   [--cache_dir CACHE_DIR]\n",
+    "                                   [--pad-token-id PAD_TOKEN_ID]\n",
+    "                                   [--ratio RATIO] [--sym]\n",
+    "                                   [--group-size GROUP_SIZE]\n",
+    "                                   [--dataset DATASET] [--all-layers] [--awq]\n",
+    "                                   [--scale-estimation]\n",
+    "                                   [--sensitivity-metric SENSITIVITY_METRIC]\n",
+    "                                   [--num-samples NUM_SAMPLES]\n",
+    "                                   [--disable-stateful]\n",
+    "                                   [--disable-convert-tokenizer]\n",
     "                                   output\n",
     "\n",
     "optional arguments:\n",
@@ -506,21 +528,20 @@
    "  --task TASK           The task to export the model for. 
If not specified,\n", " the task will be auto-inferred based on the model.\n", " Available tasks depend on the model, but are among:\n", - " ['semantic-segmentation', 'zero-shot-image-\n", - " classification', 'text-generation', 'stable-diffusion-\n", - " xl', 'image-classification', 'image-segmentation',\n", - " 'conversational', 'audio-classification', 'text2text-\n", - " generation', 'automatic-speech-recognition', 'text-to-\n", - " audio', 'audio-frame-classification', 'question-\n", - " answering', 'stable-diffusion', 'mask-generation',\n", - " 'zero-shot-object-detection', 'token-classification',\n", - " 'image-to-text', 'feature-extraction', 'audio-\n", - " xvector', 'text-classification', 'fill-mask', 'object-\n", - " detection', 'multiple-choice', 'masked-im']. For\n", - " decoder models, use `xxx-with-past` to export the\n", - " model using past key values in the decoder.\n", - " --cache_dir CACHE_DIR\n", - " Path indicating where to store cache.\n", + " ['image-to-text', 'audio-frame-classification', 'text-\n", + " generation', 'fill-mask', 'image-segmentation',\n", + " 'audio-xvector', 'semantic-segmentation', 'depth-\n", + " estimation', 'token-classification', 'zero-shot-image-\n", + " classification', 'zero-shot-object-detection',\n", + " 'text2text-generation', 'sentence-similarity',\n", + " 'feature-extraction', 'conversational', 'image-\n", + " classification', 'text-to-audio', 'stable-diffusion',\n", + " 'image-to-image', 'text-classification', 'automatic-\n", + " speech-recognition', 'multiple-choice', 'masked-im',\n", + " 'mask-generation', 'question-answering', 'object-\n", + " detection', 'audio-classification', 'stable-diffusion-\n", + " xl']. For decoder models, use `xxx-with-past` to\n", + " export the model using past key values in the decoder.\n", " --framework {pt,tf} The framework to use for the export. If not provided,\n", " will attempt to use the local checkpoint's original\n", " framework or what is available in the environment.\n", @@ -529,12 +550,77 @@ " for repositories you trust and in which you have read\n", " the code, as it will execute on your local machine\n", " arbitrary code present in the model repository.\n", + " --weight-format {fp32,fp16,int8,int4,int4_sym_g128,int4_asym_g128,int4_sym_g64,int4_asym_g64}\n", + " he weight format of the exported model.\n", + " --library {transformers,diffusers,timm,sentence_transformers}\n", + " The library used to load the model before export. If\n", + " not provided, will attempt to infer the local\n", + " checkpoint's library\n", + " --cache_dir CACHE_DIR\n", + " The path to a directory in which the downloaded model\n", + " should be cached if the standard cache should not be\n", + " used.\n", " --pad-token-id PAD_TOKEN_ID\n", " This is needed by some models, for some tasks. If not\n", " provided, will attempt to use the tokenizer to guess\n", " it.\n", - " --fp16 Compress weights to fp16\n", - " --int8 Compress weights to int8\n" + " --ratio RATIO A parameter used when applying 4-bit quantization to\n", + " control the ratio between 4-bit and 8-bit\n", + " quantization. If set to 0.8, 80% of the layers will be\n", + " quantized to int4 while 20% will be quantized to int8.\n", + " This helps to achieve better accuracy at the sacrifice\n", + " of the model size and inference latency. Default value\n", + " is 1.0.\n", + " --sym Whether to apply symmetric quantization\n", + " --group-size GROUP_SIZE\n", + " The group size to use for quantization. 
Recommended\n", + " value is 128 and -1 uses per-column quantization.\n", + " --dataset DATASET The dataset used for data-aware compression or\n", + " quantization with NNCF. You can use the one from the\n", + " list ['wikitext2','c4','c4-new'] for language models\n", + " or ['conceptual_captions','laion/220k-GPT4Vision-\n", + " captions-from-LIVIS','laion/filtered-wit'] for\n", + " diffusion models.\n", + " --all-layers Whether embeddings and last MatMul layers should be\n", + " compressed to INT4. If not provided an weight\n", + " compression is applied, they are compressed to INT8.\n", + " --awq Whether to apply AWQ algorithm. AWQ improves\n", + " generation quality of INT4-compressed LLMs, but\n", + " requires additional time for tuning weights on a\n", + " calibration dataset. To run AWQ, please also provide a\n", + " dataset argument. Note: it's possible that there will\n", + " be no matching patterns in the model to apply AWQ, in\n", + " such case it will be skipped.\n", + " --scale-estimation Indicates whether to apply a scale estimation\n", + " algorithm that minimizes the L2 error between the\n", + " original and compressed layers. Providing a dataset is\n", + " required to run scale estimation. Please note, that\n", + " applying scale estimation takes additional memory and\n", + " time.\n", + " --sensitivity-metric SENSITIVITY_METRIC\n", + " The sensitivity metric for assigning quantization\n", + " precision to layers. Can be one of the following:\n", + " ['weight_quantization_error',\n", + " 'hessian_input_activation',\n", + " 'mean_activation_variance', 'max_activation_variance',\n", + " 'mean_activation_magnitude'].\n", + " --num-samples NUM_SAMPLES\n", + " The maximum number of samples to take from the dataset\n", + " for quantization.\n", + " --disable-stateful Disable stateful converted models, stateless models\n", + " will be generated instead. Stateful models are\n", + " produced by default when this key is not used. In\n", + " stateful models all kv-cache inputs and outputs are\n", + " hidden in the model and are not exposed as model\n", + " inputs and outputs. If --disable-stateful option is\n", + " used, it may result in sub-optimal inference\n", + " performance. Use it when you intentionally want to use\n", + " a stateless model, for example, to be compatible with\n", + " existing OpenVINO native inference code that expects\n", + " kv-cache inputs and outputs in the model.\n", + " --disable-convert-tokenizer\n", + " Do not add converted tokenizer and detokenizer\n", + " OpenVINO models.\n" ] } ], @@ -543,7 +629,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -569,24 +654,31 @@ "name": "stdout", "output_type": "stream", "text": [ - "Framework not specified. Using pt to export to ONNX.\n", - "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", + "2024-07-17 09:40:45.950526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. 
xFormers was built for:\n", + " PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.3.0+cpu)\n", + " Python 3.8.18 (you have 3.8.10)\n", + " Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)\n", + " Memory-efficient attention, SwiGLU, sparse and more won't be available.\n", + " Set XFORMERS_MORE_DETAILS=1 for more details\n", + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n", + " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n", + "/home/ea/work/my_optimum_intel/optimum_env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n", + "Framework not specified. Using pt to export the model.\n", + "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Using the export variant default. Available variants are:\n", - " - default: The default ONNX variant.\n", - "Using framework PyTorch: 2.1.0+cpu\n", + "Using framework PyTorch: 2.3.0+cpu\n", "Overriding 1 configuration item(s)\n", "\t- use_cache -> False\n" ] } ], "source": [ - "!optimum-cli export openvino --model $MODEL --task text-classification --fp16 models/optimum_model/fp16" + "!optimum-cli export openvino --model $MODEL --task text-classification --weight-format fp16 models/optimum_model/fp16" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -602,8 +694,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Compiling the model to AUTO ...\n", - "Setting OpenVINO CACHE_DIR to models/optimum_model/fp16/model_cache\n" + "Compiling the model to AUTO ...\n" ] } ], @@ -612,7 +703,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -620,7 +710,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -654,7 +743,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -703,7 +791,39 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": {}, + "state": { + "087c01ae5c9c44f3a1a0730b2d856f97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "0dc8780d967e49ad89fc4d76f64a1b06": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "d844663d421c4ea9a448d5d44be7f961": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "CPU", + "GPU.0", + "GPU.1", + "AUTO" + ], + "description": 
"Device:", + "index": 3, + "layout": "IPY_MODEL_0dc8780d967e49ad89fc4d76f64a1b06", + "style": "IPY_MODEL_087c01ae5c9c44f3a1a0730b2d856f97" + } + } + }, "version_major": 2, "version_minor": 0 }