From 38e3cd077d65bac55cccf44ce50a40e475c1b934 Mon Sep 17 00:00:00 2001
From: Guy Nicholson <35084975+guynich@users.noreply.github.com>
Date: Sun, 26 Feb 2023 08:13:33 -0800
Subject: [PATCH 1/3] Adds documentation for the default_n_bit quantization scheme.

Provides more information and code examples.
---
 .../experimental/default_n_bit/README.md      | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
index a313fa4ef..e959e9ab7 100644
--- a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
+++ b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
@@ -1,2 +1,34 @@
 This directory is modified based on default_8bit, which allows you to manually
 change the number of bits of weight and activation in QAT.
+
+Code example given a Keras float `model`:
+
+```
+# Imports.
+import tensorflow_model_optimization as tfmot
+
+from tensorflow_model_optimization.python.core.quantization.keras.quantize import quantize_annotate_model
+from tensorflow_model_optimization.python.core.quantization.keras.quantize import quantize_apply
+
+from tensorflow_model_optimization.python.core.quantization.keras.experimental.default_n_bit import default_n_bit_quantize_scheme
+
+
+# TODO(user): define Keras float model.
+
+# Specify scheme with 4-bit weights and 8-bit activations.
+qat_scheme_4w8a = default_n_bit_quantize_scheme.DefaultNBitQuantizeScheme(
+    num_bits_weight=4,
+    num_bits_activation=8,
+)
+
+# Annotate the model for quantized aware training.
+with tfmot.quantization.keras.quantize_scope():
+  quantized_aware_model = quantize_apply(
+      quantize_annotate_model(model),
+      qat_scheme_4w8a,
+  )
+
+# TODO(user): compile and train `quantized_aware_model` using standard Keras methods.
+```
+
+To improve task quality it may be necessary to specify higher weight precision, such as 8-bit, for the first and last layers of the model. This can be achieved with per-layer wrapper code. A code example is shown in the [kws_streaming](https://github.com/google-research/google-research/commit/c87bac8133e00dc4fe646c182072676146312e0f) framework in the Google Research repository.

From 5da151f989924261b351db9352596e9b9c765425 Mon Sep 17 00:00:00 2001
From: Guy Nicholson <35084975+guynich@users.noreply.github.com>
Date: Tue, 28 Feb 2023 09:37:45 -0800
Subject: [PATCH 2/3] Update README.md

Added notes on TF Lite conversion.
---
 .../quantization/keras/experimental/default_n_bit/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
index e959e9ab7..396579bd0 100644
--- a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
+++ b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
@@ -28,7 +28,11 @@ with tfmot.quantization.keras.quantize_scope():
       qat_scheme_4w8a,
   )
 
-# TODO(user): compile and train `quantized_aware_model` using standard Keras methods.
+# TODO(user): compile and train quantized_aware_model using standard Keras methods.
 ```
 
+The recommended activation precision is 8-bit for TF Lite conversion.
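+
+A minimal conversion sketch follows, using the usual Keras-to-TF Lite conversion flow. It assumes the trained `quantized_aware_model` from the example above; the output path is only illustrative.
+
+```
+import tensorflow as tf
+
+# Convert the quantization aware model to a TF Lite flatbuffer.
+converter = tf.lite.TFLiteConverter.from_keras_model(quantized_aware_model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+tflite_model = converter.convert()
+
+# Illustrative output path.
+with open('model_4w8a.tflite', 'wb') as f:
+  f.write(tflite_model)
+```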
+
+Before TF 2.11.0, the TF Lite converter stores each weight value in its own byte of the weight tensor, so a 4-bit weight trained with the default_n_bit scheme takes integer values in [-7, 7] but still occupies a full byte. With TF 2.11.0 (and the TF 2.12.0 release candidate), packing of 4-bit weights is added for selected operators; for example, two 4-bit weights are packed per byte for the regular convolution operator in TF 2.11.0.
+
 To improve task quality it may be necessary to specify higher weight precision, such as 8-bit, for the first and last layers of the model. This can be achieved with per-layer wrapper code. A code example is shown in the [kws_streaming](https://github.com/google-research/google-research/commit/c87bac8133e00dc4fe646c182072676146312e0f) framework in the Google Research repository.

From 9d8147a6e1f85346d7135f91f1d69d16f786f117 Mon Sep 17 00:00:00 2001
From: Guy Nicholson <35084975+guynich@users.noreply.github.com>
Date: Tue, 28 Feb 2023 09:42:05 -0800
Subject: [PATCH 3/3] Update README.md

Documentation changes.
---
 .../keras/experimental/default_n_bit/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
index 396579bd0..e7477b7ec 100644
--- a/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
+++ b/tensorflow_model_optimization/python/core/quantization/keras/experimental/default_n_bit/README.md
@@ -1,7 +1,7 @@
 This directory is modified based on default_8bit, which allows you to manually
 change the number of bits of weight and activation in QAT.
 
-Code example given a Keras float `model`:
+Code example for quantizing a Keras float `model`:
 
 ```
 # Imports.
@@ -15,13 +15,13 @@ from tensorflow_model_optimization.python.core.quantization.keras.experimental.d
 
 # TODO(user): define Keras float model.
 
-# Specify scheme with 4-bit weights and 8-bit activations.
+# Specify a quantization scheme with 4-bit weights and 8-bit activations.
 qat_scheme_4w8a = default_n_bit_quantize_scheme.DefaultNBitQuantizeScheme(
     num_bits_weight=4,
     num_bits_activation=8,
 )
 
-# Annotate the model for quantized aware training.
+# Prepare the model for quantization aware training.
 with tfmot.quantization.keras.quantize_scope():
   quantized_aware_model = quantize_apply(
       quantize_annotate_model(model),
@@ -31,7 +31,7 @@ with tfmot.quantization.keras.quantize_scope():
 # TODO(user): compile and train quantized_aware_model using standard Keras methods.
 ```
 
-The recommended activation precision is 8-bit for TF Lite conversion.
+The recommended activation precision for TF Lite conversion is 8-bit.
 
 Before TF 2.11.0, the TF Lite converter stores each weight value in its own byte of the weight tensor, so a 4-bit weight trained with the default_n_bit scheme takes integer values in [-7, 7] but still occupies a full byte. With TF 2.11.0 (and the TF 2.12.0 release candidate), packing of 4-bit weights is added for selected operators; for example, two 4-bit weights are packed per byte for the regular convolution operator in TF 2.11.0.
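+
+A hedged sketch of such per-layer wrapping is below. It assumes that `DefaultNBitQuantizeConfig` in `default_n_bit_quantize_registry` mirrors the corresponding `default_8bit` config class and accepts `num_bits_weight`/`num_bits_activation` keyword arguments, and that the first and last layers can be selected by name; verify the class and its constructor in this directory, and see the kws_streaming commit referenced in this README for a complete implementation.
+
+```
+import tensorflow as tf
+import tensorflow_model_optimization as tfmot
+
+from tensorflow_model_optimization.python.core.quantization.keras.quantize import quantize_annotate_layer
+from tensorflow_model_optimization.python.core.quantization.keras.quantize import quantize_apply
+from tensorflow_model_optimization.python.core.quantization.keras.experimental.default_n_bit import default_n_bit_quantize_registry
+from tensorflow_model_optimization.python.core.quantization.keras.experimental.default_n_bit import default_n_bit_quantize_scheme
+
+# Assumption: DefaultNBitQuantizeConfig takes (weight_attrs, activation_attrs,
+# quantize_output) plus num_bits_weight/num_bits_activation, like the 8-bit config.
+config_8w8a = default_n_bit_quantize_registry.DefaultNBitQuantizeConfig(
+    ['kernel'], ['activation'], False,
+    num_bits_weight=8,
+    num_bits_activation=8,
+)
+
+# Placeholder layer names: replace with the first and last weighted layers of the model.
+high_precision_layers = {'first_conv', 'last_dense'}
+
+def annotate(layer):
+  if isinstance(layer, tf.keras.layers.InputLayer):
+    return layer
+  if layer.name in high_precision_layers:
+    # Layers listed above keep 8-bit weights via their own quantize config.
+    return quantize_annotate_layer(layer, quantize_config=config_8w8a)
+  # All other layers fall back to the scheme passed to quantize_apply below.
+  return quantize_annotate_layer(layer)
+
+annotated_model = tf.keras.models.clone_model(model, clone_function=annotate)
+
+# The per-layer config class must be visible inside quantize_scope.
+with tfmot.quantization.keras.quantize_scope(
+    {'DefaultNBitQuantizeConfig': default_n_bit_quantize_registry.DefaultNBitQuantizeConfig}):
+  mixed_precision_model = quantize_apply(
+      annotated_model,
+      default_n_bit_quantize_scheme.DefaultNBitQuantizeScheme(
+          num_bits_weight=4,
+          num_bits_activation=8,
+      ),
+  )
+```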