Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unambiguous compression setup to resume properly #682

Merged
merged 46 commits into from
Jul 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
d779825
Unambiguous compression setup to resume properly
ljaljushkin Jun 22, 2021
e10677f
Unambiguous compression state for TF to resume properly (#2)
ljaljushkin Jun 24, 2021
783ae47
Merge remote-tracking branch 'origin/develop' into qsetup_ckpt
ljaljushkin Jun 24, 2021
0ca21c6
buggy input quantization
ljaljushkin Jun 29, 2021
27f0fb1
WA for the evil bug 58886 with imports
ljaljushkin Jun 29, 2021
81c3f63
Merge branch 'buggy_merge_and_fixes' into qsetup_ckpt
ljaljushkin Jun 29, 2021
9040d33
Merge remote-tracking branch 'origin/develop' into qsetup_ckpt
ljaljushkin Jun 29, 2021
dc2c25d
fixed code style
ljaljushkin Jun 29, 2021
664df52
WA for the issue with matching layer indexes from builder state and f…
ljaljushkin Jun 30, 2021
28e8f79
Kept order of imports unchanged
ljaljushkin Jun 30, 2021
686f1d9
Single commit + WA for layer indexes + imports order preserving
ljaljushkin Jun 30, 2021
150a59a
Merge remote-tracking branch 'fork/qsetup_ckpt' into qsetup_ckpt
ljaljushkin Jun 30, 2021
08880e0
load checkpoints corrently for segmentation sample
ljaljushkin Jun 30, 2021
5fed3f2
WA for evil bug with order of imports.
ljaljushkin Jun 30, 2021
a3876f0
Removed CompressionState for PT, compression state is Dict now
ljaljushkin Jul 1, 2021
274e935
Fixed some tests
ljaljushkin Jul 1, 2021
cdf3c9c
Just warn about inconsistent compression stage
ljaljushkin Jul 1, 2021
443a25d
Proper saving checkpoint in OD
ljaljushkin Jul 1, 2021
d516aab
Compression state for NNCF TF (#4)
andrey-churkin Jul 2, 2021
e6b3144
Aligned batch size of resnet50_imagenet_rb_sparsity50_int8 for sota eval
ljaljushkin Jul 2, 2021
41281e0
Merge remote-tracking branch 'fork/qsetup_ckpt' into qsetup_ckpt
ljaljushkin Jul 2, 2021
f02f29a
Merge remote-tracking branch 'origin/develop' into qsetup_ckpt
ljaljushkin Jul 2, 2021
0e1e1b1
code style
ljaljushkin Jul 2, 2021
731f1df
Resolved comment
ljaljushkin Jul 5, 2021
5c55436
Merge remote-tracking branch 'origin/develop' into qsetup_ckpt
ljaljushkin Jul 5, 2021
f00cddc
Unambiguous compression state in a single commit +
ljaljushkin Jul 5, 2021
4d6ee59
code style
ljaljushkin Jul 5, 2021
20d4960
Merge branch 'squashed_682' into qsetup_ckpt
ljaljushkin Jul 5, 2021
b3f9c4c
Merge remote-tracking branch 'origin/develop' into qsetup_ckpt
ljaljushkin Jul 5, 2021
ba8edda
code style one more time
ljaljushkin Jul 5, 2021
f2d5857
force_no_init on install
ljaljushkin Jul 5, 2021
a993337
corrected test with bn init
ljaljushkin Jul 5, 2021
ac0e772
Fixed segmentation sanity eval
ljaljushkin Jul 5, 2021
111b933
too many statements for OD-TF
ljaljushkin Jul 5, 2021
d0c85bc
return for get_state
ljaljushkin Jul 5, 2021
edab50d
Documentation
ljaljushkin Jul 5, 2021
4fed3e6
CONSISTENCY with state names
ljaljushkin Jul 5, 2021
eb1837c
empty line between import and licence
ljaljushkin Jul 5, 2021
9501b55
consistency ...
ljaljushkin Jul 5, 2021
5d71f7c
pylint, don't worry...
ljaljushkin Jul 5, 2021
e0598fb
prepare_checkpoint without initialization and compression state to av…
ljaljushkin Jul 5, 2021
ec20c57
Fixed invalid shape of images for init in test
ljaljushkin Jul 5, 2021
c5d3332
Fixed Mask-RCNN eval sanity
ljaljushkin Jul 5, 2021
de2c64c
Skipped prepare_checkpoint as it doesn't resume properly
ljaljushkin Jul 5, 2021
67893de
Reverted sensitive names in the builder state + backward compat. tests
ljaljushkin Jul 6, 2021
347a36d
Fixed invalid name
ljaljushkin Jul 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions examples/tensorflow/classification/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from nncf.tensorflow import create_compression_callbacks
from nncf.tensorflow.helpers.model_manager import TFOriginalModelManager
from nncf.tensorflow.initialization import register_default_init_args
from nncf.tensorflow.utils.state import TFCompressionState
from nncf.tensorflow.utils.state import TFCompressionStateLoader

from examples.tensorflow.classification.datasets.builder import DatasetBuilder
from examples.tensorflow.common.argparser import get_common_argument_parser
Expand Down Expand Up @@ -131,6 +133,12 @@ def resume_from_checkpoint(checkpoint, ckpt_path, steps_per_epoch):
return initial_epoch


def load_compression_state(ckpt_path: str):
    """Restore the NNCF compression state stored in a checkpoint.

    :param ckpt_path: Path to a checkpoint file or a directory containing one.
    :return: The compression state object read from the checkpoint's
        ``compression_state`` slot.
    """
    state_loader = TFCompressionStateLoader()
    ckpt = tf.train.Checkpoint(compression_state=state_loader)
    load_checkpoint(ckpt, ckpt_path)
    return state_loader.state


def run(config):
strategy = get_distribution_strategy(config)
if config.metrics_dump is not None:
Expand Down Expand Up @@ -168,13 +176,14 @@ def run(config):
return_dict=True)
uncompressed_model_accuracy = 100 * results['acc@1']

compression_state = None
if resume_training:
compression_state = load_compression_state(config.ckpt_path)

with TFOriginalModelManager(model_fn, **model_params) as model:
with strategy.scope():
compression_ctrl, compress_model = create_compressed_model(model,
nncf_config,
should_init=not resume_training)
compression_callbacks = create_compression_callbacks(compression_ctrl,
log_dir=config.log_dir)
compression_ctrl, compress_model = create_compressed_model(model, nncf_config, compression_state)
compression_callbacks = create_compression_callbacks(compression_ctrl, log_dir=config.log_dir)

scheduler = build_scheduler(
config=config,
Expand All @@ -201,7 +210,8 @@ def run(config):

compress_model.summary()

checkpoint = tf.train.Checkpoint(model=compress_model, compression_ctrl=compression_ctrl)
checkpoint = tf.train.Checkpoint(model=compress_model,
compression_state=TFCompressionState(compression_ctrl))

initial_epoch = 0
if resume_training:
Expand Down Expand Up @@ -279,9 +289,12 @@ def export(config):
pretrained=config.get('pretrained', False),
weights=config.get('weights', None))
model = model(**model_params)
compression_ctrl, compress_model = create_compressed_model(model,
config.nncf_config,
should_init=False)

compression_state = None
if config.ckpt_path:
compression_state = load_compression_state(config.ckpt_path)

compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

metrics = [
tf.keras.metrics.CategoricalAccuracy(name='acc@1'),
Expand All @@ -293,7 +306,8 @@ def export(config):
metrics=metrics)
compress_model.summary()

checkpoint = tf.train.Checkpoint(model=compress_model, compression_ctrl=compression_ctrl)
checkpoint = tf.train.Checkpoint(model=compress_model,
compression_state=TFCompressionState(compression_ctrl))

if config.ckpt_path is not None:
load_checkpoint(checkpoint=checkpoint,
Expand Down
23 changes: 15 additions & 8 deletions examples/tensorflow/common/prepare_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import tensorflow as tf

from nncf.tensorflow.helpers.model_creation import create_compressed_model
from nncf.tensorflow.utils.state import TFCompressionState
from nncf.tensorflow.utils.state import TFCompressionStateLoader
from examples.tensorflow.common.logger import logger
from examples.tensorflow.common.sample_config import create_sample_config
from examples.tensorflow.common.argparser import get_common_argument_parser
Expand Down Expand Up @@ -71,18 +73,24 @@ def load_checkpoint(checkpoint, ckpt_path):
return None


def load_compression_state(ckpt_path: str):
    """Restore the NNCF compression state stored in a checkpoint.

    :param ckpt_path: Path to a checkpoint file or a directory containing one.
    :return: The compression state object read from the checkpoint's
        ``compression_state`` slot.
    """
    state_loader = TFCompressionStateLoader()
    ckpt = tf.train.Checkpoint(compression_state=state_loader)
    load_checkpoint(ckpt, ckpt_path)
    return state_loader.state


def od_checkpoint_saver(config):
"""
Load object detection checkpoint and re-save it without optimizer (memory footprint is reduced).
"""
model_builder = get_model_od_builder(config)
model = model_builder.build_model()

compression_ctrl, compress_model = create_compressed_model(model,
config.nncf_config,
should_init=False)
compression_state = load_compression_state(config.ckpt_path)
compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

checkpoint = tf.train.Checkpoint(model=compress_model, compression_ctrl=compression_ctrl)
checkpoint = tf.train.Checkpoint(model=compress_model,
compression_state=TFCompressionState(compression_ctrl))
load_and_save_checkpoint(checkpoint, config)


Expand All @@ -93,13 +101,12 @@ def seg_checkpoint_saver(config):
model_builder = get_model_seg_builder(config)
model = model_builder.build_model()

compression_ctrl, compress_model = create_compressed_model(model,
config.nncf_config,
should_init=False)
compression_state = load_compression_state(config.ckpt_path)
compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

variables = get_variables(compress_model)
checkpoint = tf.train.Checkpoint(variables=variables,
compression_ctrl=compression_ctrl,
compression_state=TFCompressionState(compression_ctrl),
step=tf.Variable(0))
load_and_save_checkpoint(checkpoint, config)

Expand Down
37 changes: 24 additions & 13 deletions examples/tensorflow/object_detection/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from nncf.common.utils.tensorboard import prepare_for_tensorboard
from nncf.config.utils import is_accuracy_aware_training
from nncf.config.structures import ModelEvaluationArgs
from nncf.tensorflow.utils.state import TFCompressionState
from nncf.tensorflow.utils.state import TFCompressionStateLoader

from examples.tensorflow.common.argparser import get_common_argument_parser
from examples.tensorflow.common.distributed import get_distribution_strategy
Expand Down Expand Up @@ -120,6 +122,12 @@ def resume_from_checkpoint(checkpoint_manager, ckpt_path, steps_per_epoch):
return initial_epoch, initial_step


def load_compression_state(ckpt_path: str):
    """Restore the NNCF compression state stored in a checkpoint.

    :param ckpt_path: Path to a checkpoint file or a directory containing one.
    :return: The compression state object read from the checkpoint's
        ``compression_state`` slot.
    """
    state_loader = TFCompressionStateLoader()
    ckpt = tf.train.Checkpoint(compression_state=state_loader)
    load_checkpoint(ckpt, ckpt_path)
    return state_loader.state


def create_test_step_fn(strategy, model, predict_post_process_fn):
"""Creates a distributed test step"""

Expand Down Expand Up @@ -271,10 +279,9 @@ def run(config):
write_metrics(0, config.metrics_dump)

# Create dataset
builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
datasets = [builder.build() for builder in builders]
train_builder, test_builder = builders
train_dataset, test_dataset = datasets
train_builder, test_builder = get_dataset_builders(config, strategy.num_replicas_in_sync)
train_dataset = train_builder.build()
test_dataset = test_builder.build()
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

Expand All @@ -300,13 +307,15 @@ def model_eval_fn(model):

resume_training = config.ckpt_path is not None

compression_state = None
if resume_training:
compression_state = load_compression_state(config.ckpt_path)

with TFOriginalModelManager(model_builder.build_model,
weights=config.get('weights', None)) as model:
with strategy.scope():
config.nncf_config.register_extra_structs([ModelEvaluationArgs(eval_fn=model_eval_fn)])
compression_ctrl, compress_model = create_compressed_model(model,
nncf_config,
should_init=not resume_training)
compression_ctrl, compress_model = create_compressed_model(model, nncf_config, compression_state)
scheduler = build_scheduler(
config=config,
steps_per_epoch=steps_per_epoch)
Expand All @@ -321,7 +330,7 @@ def model_eval_fn(model):

checkpoint = tf.train.Checkpoint(model=compress_model,
optimizer=optimizer,
compression_ctrl=compression_ctrl)
compression_state=TFCompressionState(compression_ctrl))
checkpoint_manager = tf.train.CheckpointManager(checkpoint, config.checkpoint_save_dir, max_to_keep=None)

initial_epoch = initial_step = 0
Expand Down Expand Up @@ -358,7 +367,6 @@ def validate_fn(model, epoch):
validate_fn=validate_fn,
tensorboard_writer=config.tb,
log_dir=config.log_dir)

else:
train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_dataset,
initial_epoch, initial_step, epochs, steps_per_epoch, checkpoint_manager,
Expand All @@ -382,12 +390,15 @@ def export(config):
model_builder = get_model_builder(config)
model = model_builder.build_model(weights=config.get('weights', None))

compression_ctrl, compress_model = create_compressed_model(model,
config.nncf_config,
should_init=False)
compression_state = None
if config.ckpt_path:
compression_state = load_compression_state(config.ckpt_path)

compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

if config.ckpt_path:
checkpoint = tf.train.Checkpoint(model=compress_model)
checkpoint = tf.train.Checkpoint(model=compress_model,
compression_state=TFCompressionState(compression_ctrl))
load_checkpoint(checkpoint, config.ckpt_path)

save_path, save_format = get_saving_parameters(config)
Expand Down
104 changes: 64 additions & 40 deletions examples/tensorflow/segmentation/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
import tensorflow as tf

from nncf.tensorflow import create_compressed_model
from nncf.tensorflow import register_default_init_args
from nncf.tensorflow.helpers.model_manager import TFOriginalModelManager
from nncf.tensorflow.utils.state import TFCompressionState
from nncf.tensorflow.utils.state import TFCompressionStateLoader

from examples.tensorflow.common.argparser import get_common_argument_parser
from examples.tensorflow.common.distributed import get_distribution_strategy
Expand Down Expand Up @@ -90,8 +93,12 @@ def get_dataset_builders(config, num_devices):
val_builder = COCODatasetBuilder(config=config,
is_train=False,
num_devices=num_devices)

return val_builder
config_ = config.deepcopy()
config_.batch_size = val_builder.batch_size
calibration_builder = COCODatasetBuilder(config=config_,
is_train=True,
num_devices=1)
return val_builder, calibration_builder
ljaljushkin marked this conversation as resolved.
Show resolved Hide resolved


def load_checkpoint(checkpoint, ckpt_path):
Expand All @@ -115,6 +122,12 @@ def load_checkpoint(checkpoint, ckpt_path):
return None


def load_compression_state(ckpt_path: str):
    """Restore the NNCF compression state stored in a checkpoint.

    :param ckpt_path: Path to a checkpoint file or a directory containing one.
    :return: The compression state object read from the checkpoint's
        ``compression_state`` slot.
    """
    state_loader = TFCompressionStateLoader()
    ckpt = tf.train.Checkpoint(compression_state=state_loader)
    load_checkpoint(ckpt, ckpt_path)
    return state_loader.state


def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
"""Runs evaluation steps and aggregate metrics"""
timer = Timer()
Expand Down Expand Up @@ -164,41 +177,56 @@ def test_step(dataset_inputs):
return test_step


def run_evaluation(config, eval_timeout=None):
"""Runs evaluation on checkpoint save directory"""
strategy = get_distribution_strategy(config)
if config.metrics_dump is not None:
write_metrics(0, config.metrics_dump)

dataset_builder = get_dataset_builders(config, strategy.num_replicas_in_sync)
dataset = dataset_builder.build()
num_batches = dataset_builder.steps_per_epoch
test_dist_dataset = strategy.experimental_distribute_dataset(dataset)

# We use `model_batch_size` to create input layer for model
config.model_batch_size = dataset_builder.batch_size

model_builder = get_model_builder(config)
def restore_compressed_model(config, strategy, model_builder, ckpt_path=None):
    """Build a compressed model and optionally restore it from a checkpoint.

    When ``ckpt_path`` is given, the NNCF compression state is loaded from it
    first (so the compression algorithms are re-created unambiguously instead
    of being re-initialized), and the model variables are restored from the
    same checkpoint after creation.

    :param config: Sample configuration; ``config.nncf_config`` and
        ``config.get('weights')`` are read here.
    :param strategy: ``tf.distribute`` strategy under whose scope the
        compressed model is created.
    :param model_builder: Builder object exposing ``build_model``.
    :param ckpt_path: Optional path to the checkpoint to restore from.
    :return: Tuple ``(compression_ctrl, compress_model, checkpoint)``.
    """
    compression_state = None
    if ckpt_path:
        compression_state = load_compression_state(ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=False) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(model,
                                                                       config.nncf_config,
                                                                       compression_state)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             compression_state=TFCompressionState(compression_ctrl),
                                             step=tf.Variable(0))
            if ckpt_path:
                # Restore from the checkpoint that was actually requested.
                # Bug fix: the original passed config.ckpt_path here, which
                # diverges from `ckpt_path` when callers (e.g. the 'train'
                # evaluation loop) supply paths from checkpoints_iterator.
                load_checkpoint(checkpoint, ckpt_path)

    return compression_ctrl, compress_model, checkpoint


def run_evaluation(config, eval_timeout=None):
"""Runs evaluation on checkpoint save directory"""
strategy = get_distribution_strategy(config)
if config.metrics_dump is not None:
write_metrics(0, config.metrics_dump)

validation_builder, calibration_builder = get_dataset_builders(config, strategy.num_replicas_in_sync)
calibration_dataset = calibration_builder.build()
val_dataset = validation_builder.build()
num_batches = validation_builder.steps_per_epoch
test_dist_dataset = strategy.experimental_distribute_dataset(val_dataset)

config.nncf_config = register_default_init_args(nncf_config=config.nncf_config,
data_loader=calibration_dataset,
batch_size=validation_builder.global_batch_size)
ljaljushkin marked this conversation as resolved.
Show resolved Hide resolved

# We use `model_batch_size` to create input layer for model
config.model_batch_size = validation_builder.batch_size

model_builder = get_model_builder(config)
eval_metric = model_builder.eval_metrics()
predict_post_process_fn = model_builder.post_processing

if 'test' in config.mode:
if config.ckpt_path:
load_checkpoint(checkpoint, config.ckpt_path)
compression_ctrl, compress_model, _ = restore_compressed_model(config, strategy, model_builder,
config.ckpt_path)
test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)

statistics = compression_ctrl.statistics()
logger.info(statistics.to_str())
Expand All @@ -213,12 +241,17 @@ def run_evaluation(config, eval_timeout=None):

elif 'train' in config.mode:
validation_summary_writer = SummaryWriter(config.log_dir, 'validation')
checkpoint_dir = config.checkpoint_save_dir
eval_timeout = config.eval_timeout

for checkpoint_path in tf.train.checkpoints_iterator(checkpoint_dir, timeout=eval_timeout):
status = checkpoint.restore(checkpoint_path)
status.expect_partial()
is_first_checkpoint = True
for checkpoint_path in tf.train.checkpoints_iterator(config.checkpoint_save_dir, config.eval_timeout):
if is_first_checkpoint:
is_first_checkpoint = False
_, compress_model, checkpoint = restore_compressed_model(config, strategy, model_builder,
checkpoint_path)
test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)
else:
checkpoint.restore(checkpoint_path).expect_partial()

logger.info('Checkpoint file {} found and restoring from checkpoint'.format(checkpoint_path))
logger.info('Checkpoint step: {}'.format(checkpoint.step.numpy()))
metric_result = evaluate(test_step, eval_metric, test_dist_dataset, num_batches, config.print_freq)
Expand All @@ -238,17 +271,8 @@ def run_evaluation(config, eval_timeout=None):
def export(config):
model_builder = get_model_builder(config)

with TFOriginalModelManager(model_builder.build_model,
weights=config.get('weights', None),
is_training=False) as model:
compression_ctrl, compress_model = create_compressed_model(model,
config.nncf_config,
should_init=False)

if config.ckpt_path:
variables = get_variables(compress_model)
checkpoint = tf.train.Checkpoint(variables=variables)
load_checkpoint(checkpoint, config.ckpt_path)
strategy = tf.distribute.get_strategy()
compression_ctrl, _, _ = restore_compressed_model(config, strategy, model_builder, config.ckpt_path)

save_path, save_format = get_saving_parameters(config)
compression_ctrl.export_model(save_path, save_format)
Expand Down
Loading