```python
{'fsdp_auto_wrap_policy': 'TRANSFORMER_BASED_WRAP',
 'fsdp_backward_prefetch_policy': 'BACKWARD_PRE',
 'fsdp_forward_prefetch': False,
 'fsdp_offload_params': False,
 'fsdp_sharding_strategy': 1,
 'fsdp_state_dict_type': 'SHARDED_STATE_DICT',
 'fsdp_sync_module_states': True,
 'fsdp_...
```
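For orientation, these accelerate-style keys correspond roughly to arguments of the raw `torch.distributed.fsdp` wrapper. A minimal sketch of that mapping, assuming an already-initialized process group; the FSDP argument names are real, but the key-to-argument mapping in the comments and the `wrap_with_fsdp` helper are illustrative assumptions, not the original setup:

```python
import functools
import torch
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    ShardingStrategy,
    BackwardPrefetch,
)
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy


def wrap_with_fsdp(model: torch.nn.Module, block_cls: type) -> FSDP:
    # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP -> transformer_auto_wrap_policy,
    # parameterized with the transformer block class(es) to wrap.
    wrap_policy = functools.partial(
        transformer_auto_wrap_policy, transformer_layer_cls={block_cls}
    )
    return FSDP(
        model,
        sharding_strategy=ShardingStrategy.FULL_SHARD,    # fsdp_sharding_strategy: 1
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,  # fsdp_backward_prefetch_policy
        forward_prefetch=False,                           # fsdp_forward_prefetch
        auto_wrap_policy=wrap_policy,
        sync_module_states=True,                          # fsdp_sync_module_states
        # fsdp_offload_params: False -> leave cpu_offload at its default (None).
        # fsdp_state_dict_type: SHARDED_STATE_DICT is applied at checkpoint time,
        # e.g. via the FSDP.state_dict_type(...) context manager.
    )
```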
```python
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

# Collect the frozen parameters so FSDP leaves them out of sharding.
not_trainable = []
for name, p in model.named_parameters():
    if not p.requires_grad:
        not_trainable.append(p)

sharding_strategy = ShardingStrategy._HYBRID_SHARD_ZERO2
model = FSDP(
    model,
    sharding_strategy=sharding_strategy,
    ignored_parameters=not_trainable,
    auto_wrap_policy=size_based_auto_wrap_policy,
)
...
```
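`_HYBRID_SHARD_ZERO2` is the ZeRO-2-style hybrid strategy: gradients and optimizer state are sharded within each node (as in `SHARD_GRAD_OP`) while the model is replicated across nodes; the leading underscore marks it as a prototype API. Passing `size_based_auto_wrap_policy` bare uses its default parameter-count threshold; it is commonly tuned with `functools.partial`. A minimal sketch, with an illustrative threshold that is not from the original snippet:

```python
import functools
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy

# Wrap any submodule with more than ~1M unwrapped parameters
# (illustrative threshold, not from the original snippet).
wrap_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=1_000_000)
```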