#!/bin/bash
#SBATCH --job-name=torchrun-test
#SBATCH --partition=gpu_batch
#SBATCH --nodes=2
#SBATCH --gres=gpu:1
#SBATCH --time=00:10:00
#SBATCH --ntasks-per-node=1
#SBATCH --output=sbatch-%N-%j.out
#SBATCH --error=sbatch-%N-%j.err

# Environment info
GPUS_PER_NODE=$SLURM_GPUS_ON_NODE
N...
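The script is truncated after the environment section. For reference, a common pattern for wiring the torchrun rendezvous to SLURM variables looks roughly like the sketch below; this is an assumption for illustration, not the original script, and `train.py` plus the port number are placeholders:

```bash
# Illustrative sketch only: not the original (truncated) script.
# Derive multi-node rendezvous settings from SLURM; train.py and 29500 are placeholders.
GPUS_PER_NODE=$SLURM_GPUS_ON_NODE
NNODES=$SLURM_NNODES
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=29500

srun torchrun \
    --nnodes="$NNODES" \
    --nproc_per_node="$GPUS_PER_NODE" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
    train.py
```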
(log_dir=None, master_addr='localhost', master_port=2222, max_restarts=0, module=False, monitor_interval=5, nnodes='2', no_python=False, node_rank=0, nproc_per_node='1', rdzv_backend='static', rdzv_conf='', rdzv_endpoint='', rdzv_id='none', redirects='0', role='...
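In this dump, nnodes='2' but master_addr is still the default 'localhost' and rdzv_endpoint is empty, so with the static backend each node appears to rendezvous against its own loopback address rather than a shared master, and the second node can never join. A minimal sketch of what an explicit master would look like (assuming MASTER_ADDR/MASTER_PORT are exported as in the sketch above, with `train.py` standing in for the real script):

```bash
# Sketch: pass the master explicitly so both nodes use the same c10d store.
# MASTER_ADDR, MASTER_PORT and train.py are assumptions, not taken from the original post.
srun torchrun \
    --nnodes=2 \
    --nproc_per_node=1 \
    --node_rank="$SLURM_NODEID" \
    --master_addr="$MASTER_ADDR" \
    --master_port="$MASTER_PORT" \
    train.py
```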
    store, rank, world_size = next(rendezvous_iterator)
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 232, in _env_rendezvous_handler
    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
  File "/opt/conda/lib/python3.8/site-package...
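The traceback shows the rendezvous failing inside `_create_c10d_store`, i.e. while connecting to the TCP store at master_addr:master_port. A quick, hypothetical check run on the second node against the address shown in the dump above (localhost:2222) can make the issue visible, since that node's loopback never hosts the store:

```bash
# Hypothetical diagnostic, run on the non-master node: try to reach the
# rendezvous store address torchrun was given (localhost:2222 per the dump above).
# With master_addr='localhost', this only ever targets the node's own loopback,
# which is consistent with the _create_c10d_store failure.
python -c "import socket; socket.create_connection(('localhost', 2222), timeout=5)"
```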