Description
...
Loading extension module cpu_adam...
Traceback (most recent call last):
File "/home/mahongli/LMFlow/examples/finetune.py", line 61, in <module>
main()
File "/home/mahongli/LMFlow/examples/finetune.py", line 57, in main
tuned_model = finetuner.tune(model=model, dataset=dataset)
File "/home/mahongli/LMFlow/src/lmflow/pipeline/finetuner.py", line 285, in tune
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/transformers/trainer.py", line 1639, in train
return inner_training_loop(
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/transformers/trainer.py", line 1708, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/__init__.py", line 125, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 340, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1283, in _configure_optimizer
basic_optimizer = self._configure_basic_optimizer(model_parameters)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1354, in _configure_basic_optimizer
optimizer = DeepSpeedCPUAdam(model_parameters,
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 96, in __init__
self.ds_opt_adam = CPUAdamBuilder().load()
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 485, in load
return self.jit_load(verbose)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 520, in jit_load
op_module = load(
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1535, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1929, in _import_module_from_library
module = importlib.util.module_from_spec(spec)
File "<frozen importlib._bootstrap>", line 565, in module_from_spec
File "<frozen importlib._bootstrap_external>", line 1173, in create_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
ImportError: /home/mahongli/.cache/torch_extensions/py39_cu117/cpu_adam/cpu_adam.so: cannot open shared object file: No such file or directory
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7f915c061820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7f2f180db820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7f0548ddd820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7fc24b130820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7f4d96660820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7f05c8043820>
Traceback (most recent call last):
File "/home/mahongli/anaconda3/envs/lmflow/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 110, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
[2023-06-05 21:56:22,835] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368214
[2023-06-05 21:56:23,637] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368215
[2023-06-05 21:56:23,719] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368216
[2023-06-05 21:56:23,719] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368217
[2023-06-05 21:56:23,776] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368221
[2023-06-05 21:56:23,816] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 1368223
[2023-06-05 21:56:23,859] [ERROR] [launch.py:324:sigkill_handler] ['/home/mahongli/anaconda3/envs/lmflow/bin/python', '-u', 'examples/finetune.py', '--local_rank=5', '--model_name_or_path', 'bigscience/bloom-560m', '--dataset_path', '/home/mahongli/LMFlow/data/alpaca/train', '--output_dir', '/home/mahongli/LMFlow/output_models/finetune', '--overwrite_output_dir', '--num_train_epochs', '1', '--learning_rate', '2e-5', '--block_size', '256', '--per_device_train_batch_size', '1', '--deepspeed', 'configs/ds_config_zero3.json', '--bf16', '--run_name', 'finetune', '--validation_split_percentage', '0', '--logging_steps', '20', '--do_train', '--ddp_timeout', '72000', '--save_steps', '5000', '--dataloader_num_workers', '1'] exits with return code = 1
Detail
I am running on a Linux server with 6 A100 GPUs.
Additional context
(lmflow) mahongli@ops-NF5468M6:~/LMFlow$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Thu_Nov_18_09:45:30_PST_2021
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0
