```python
_process_group_backend, timeout=self._timeout)

    def _get_process_group_backend(self) -> str:
        assert self.root_device.type == "hpu"
        return "hccl"
        return self._process_group_backend

    def set_world_ranks(self) -> None:
        if self.cluster_environment is not None:
            ...
```
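For comparison, the backend can also be requested explicitly instead of relying on the strategy's device-type check. The following is a minimal sketch, not the strategy's own code path: it assumes the Habana PyTorch bridge is installed (importing `habana_frameworks.torch.distributed.hccl` is what registers the `hccl` backend with `torch.distributed`), and the rendezvous values are placeholders.

```python
import os
import torch.distributed as dist

# Assumption: the Habana PyTorch bridge is installed; this import
# registers the "hccl" backend with torch.distributed.
import habana_frameworks.torch.distributed.hccl  # noqa: F401

os.environ.setdefault("MASTER_ADDR", "10.0.0.1")  # placeholder rendezvous address
os.environ.setdefault("MASTER_PORT", "12345")     # placeholder port

# Request HCCL explicitly so no NCCL process group is ever constructed.
dist.init_process_group(
    backend="hccl",
    rank=int(os.environ["RANK"]),
    world_size=int(os.environ["WORLD_SIZE"]),
)
assert dist.get_backend() == "hccl"  # would report "nccl" in the failing runs
```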
distributed_c10d.py", line 510, in init_process_group timeout=timeout)) File "/home/user1/miniconda3/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 603, in _new_process_group_helper timeout) RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs ...
```
timeout            4.0.2
attrs              22.1.0
audioread          3.0.0
Babel              2.11.0
backcall           0.2.0
backoff            2.2.1
base58             2.1.1
beautifulsoup4     4.11.1
bertviz            1.4.0
binaryornot        0.4.4
bitsandbytes       0.37.0
black              23.3.0
bleach             5.0.1
blessed            1.20.0
bokeh              2.4.3
boto3              1.26.64
botocore           1.29.64
Brotli             1.0.9
brotli...
```
And when the remote nodes eventually time out, this is the stack trace:

```
  File "lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line ...
```