You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I tried running the LongNet Model example script from torchscale/examples, but ran into the following issue:
/fairseq/(torchscale) :~/data/results/fairseq$ torchrun --nproc_per_node=8 --master_port 29501 --nnodes=1 train.py /home/data/dataset/yehuicheng/LongNet_example/DNA_example/longnet_example --num-workers 0 --activation-fn gelu --share-decoder-input-output-embed --validate-interval-updates 1000 --save-interval-updates 1000 --no-epoch-checkpoints --memory-efficient-fp16 --fp16-init-scale 4 --arch transformer --task language_modeling --sample-break-mode none --tokens-per-sample 4096 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 --clip-norm 0.0 --lr 5e-4 --lr-scheduler polynomial_decay --warmup-updates 750 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --batch-size 4 --update-freq 1 --required-batch-size-multiple 1 --total-num-update 50000 --max-update 50000 --seed 1 --ddp-backend=c10d --flash-attention --segment-length [2048,4096] --dilated-ratio [1,2]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
Traceback (most recent call last):
  File "train.py", line 12, in <module>
    from fairseq_cli.train import cli_main
  File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in <module>
    from fairseq import (
  File "/data/results/yehuicheng/fairseq/fairseq/__init__.py", line 32, in <module>
    import fairseq.criterions # noqa
  File "/data/results/yehuicheng/fairseq/fairseq/criterions/__init__.py", line 36, in <module>
    importlib.import_module("fairseq.criterions." + file_name)
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in <module>
    from fairseq.tasks import FairseqTask
  File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 136, in <module>
    import_tasks(tasks_dir, "fairseq.tasks")
  File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 117, in import_tasks
    importlib.import_module(namespace + "." + task_name)
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in <module>
    import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/init.py", line 32, in
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/init.py", line 36, in
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 136, in
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/init.py", line 32, in
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/init.py", line 36, in
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 136, in
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
W1107 16:47:17.969652 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288674 closing signal SIGTERM
W1107 16:47:17.970422 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288675 closing signal SIGTERM
W1107 16:47:17.970967 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288676 closing signal SIGTERM
W1107 16:47:17.971170 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288678 closing signal SIGTERM
W1107 16:47:17.971362 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288679 closing signal SIGTERM
W1107 16:47:17.971545 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288681 closing signal SIGTERM
W1107 16:47:17.971737 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288684 closing signal SIGTERM
E1107 16:47:18.650703 139642846356096 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 3288673) of binary: /home/yehuicheng/miniconda3/envs/torchscale/bin/python3.8
Traceback (most recent call last):
  File "/home/yehuicheng/miniconda3/envs/torchscale/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
I tried running the LongNet Model example script from torchscale/examples, but ran into the following issue:
/fairseq/(torchscale) :~/data/results/fairseq$ torchrun --nproc_per_node=8 --master_port 29501 --nnodes=1 train.py /home/data/dataset/yehuicheng/LongNet_example/DNA_example/longnet_example --num-workers 0 --activation-fn gelu --share-decoder-input-output-embed --validate-interval-updates 1000 --save-interval-updates 1000 --no-epoch-checkpoints --memory-efficient-fp16 --fp16-init-scale 4 --arch transformer --task language_modeling --sample-break-mode none --tokens-per-sample 4096 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 --clip-norm 0.0 --lr 5e-4 --lr-scheduler polynomial_decay --warmup-updates 750 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --batch-size 4 --update-freq 1 --required-batch-size-multiple 1 --total-num-update 50000 --max-update 50000 --seed 1 --ddp-backend=c10d --flash-attention --segment-length [2048,4096] --dilated-ratio [1,2]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
Traceback (most recent call last):
File "train.py", line 12, in
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/init.py", line 32, in
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/init.py", line 36, in
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 136, in
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/init.py", line 32, in
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/init.py", line 36, in
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 136, in
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/init.py", line 32, in
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/init.py", line 36, in
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 136, in
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/init.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
W1107 16:47:17.969652 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288674 closing signal SIGTERM
W1107 16:47:17.970422 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288675 closing signal SIGTERM
W1107 16:47:17.970967 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288676 closing signal SIGTERM
W1107 16:47:17.971170 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288678 closing signal SIGTERM
W1107 16:47:17.971362 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288679 closing signal SIGTERM
W1107 16:47:17.971545 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288681 closing signal SIGTERM
W1107 16:47:17.971737 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288684 closing signal SIGTERM
E1107 16:47:18.650703 139642846356096 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 3288673) of binary: /home/yehuicheng/miniconda3/envs/torchscale/bin/python3.8
Traceback (most recent call last):
File "/home/yehuicheng/miniconda3/envs/torchscale/bin/torchrun", line 8, in
sys.exit(main())
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-11-07_16:47:17
host : bdp-gpu04.bdp.biosino.org
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3288673)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
The text was updated successfully, but these errors were encountered: