-
Notifications
You must be signed in to change notification settings - Fork 219
Open
Description
I tried to run the LongNet model example script from torchscale/examples, but ran into the following issue:
/fairseq/(torchscale) :~/data/results/fairseq$ torchrun --nproc_per_node=8 --master_port 29501 --nnodes=1 train.py /home/data/dataset/yehuicheng/LongNet_example/DNA_example/longnet_example --num-workers 0 --activation-fn gelu --share-decoder-input-output-embed --validate-interval-updates 1000 --save-interval-updates 1000 --no-epoch-checkpoints --memory-efficient-fp16 --fp16-init-scale 4 --arch transformer --task language_modeling --sample-break-mode none --tokens-per-sample 4096 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 --clip-norm 0.0 --lr 5e-4 --lr-scheduler polynomial_decay --warmup-updates 750 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --batch-size 4 --update-freq 1 --required-batch-size-multiple 1 --total-num-update 50000 --max-update 50000 --seed 1 --ddp-backend=c10d --flash-attention --segment-length [2048,4096] --dilated-ratio [1,2]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779]
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1107 16:47:02.628910 139642846356096 torch/distributed/run.py:779] *****************************************
Traceback (most recent call last):
File "train.py", line 12, in <module>
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in <module>
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/__init__.py", line 32, in <module>
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/__init__.py", line 36, in <module>
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in <module>
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 136, in <module>
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in <module>
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in <module>
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in <module>
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/__init__.py", line 32, in <module>
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/__init__.py", line 36, in <module>
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in <module>
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 136, in <module>
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in <module>
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
Traceback (most recent call last):
File "train.py", line 12, in <module>
from fairseq_cli.train import cli_main
File "/data/results/yehuicheng/fairseq/fairseq_cli/train.py", line 28, in <module>
from fairseq import (
File "/data/results/yehuicheng/fairseq/fairseq/__init__.py", line 32, in <module>
import fairseq.criterions # noqa
File "/data/results/yehuicheng/fairseq/fairseq/criterions/__init__.py", line 36, in <module>
importlib.import_module("fairseq.criterions." + file_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/criterions/ctc.py", line 19, in <module>
from fairseq.tasks import FairseqTask
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 136, in <module>
import_tasks(tasks_dir, "fairseq.tasks")
File "/data/results/yehuicheng/fairseq/fairseq/tasks/__init__.py", line 117, in import_tasks
importlib.import_module(namespace + "." + task_name)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "/data/results/yehuicheng/fairseq/fairseq/tasks/pretraining.py", line 15, in <module>
import sentencepiece as spm
ModuleNotFoundError: No module named 'sentencepiece'
W1107 16:47:17.969652 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288674 closing signal SIGTERM
W1107 16:47:17.970422 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288675 closing signal SIGTERM
W1107 16:47:17.970967 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288676 closing signal SIGTERM
W1107 16:47:17.971170 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288678 closing signal SIGTERM
W1107 16:47:17.971362 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288679 closing signal SIGTERM
W1107 16:47:17.971545 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288681 closing signal SIGTERM
W1107 16:47:17.971737 139642846356096 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3288684 closing signal SIGTERM
E1107 16:47:18.650703 139642846356096 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 3288673) of binary: /home/yehuicheng/miniconda3/envs/torchscale/bin/python3.8
Traceback (most recent call last):
File "/home/yehuicheng/miniconda3/envs/torchscale/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
return f(*args, **kwargs)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/yehuicheng/miniconda3/envs/torchscale/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-11-07_16:47:17
host : bdp-gpu04.bdp.biosino.org
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3288673)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Metadata
Metadata
Assignees
Labels
No labels