vit base, large, huge: patch大小,向量维度,encoder深度, encoder注意力头数不同。 def mae_vit_base_patch16_dec512d8b(**kwargs): model = MaskedAutoencoderViT( patch_size=16, embed_dim=768, depth=12, num_heads=12, decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16, mlp_rati...
mask_ratio) pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3] loss = self.forward_loss(imgs, pred, mask) return loss, pred, mask # dec512d8b -> decoder: 512 dim, 8 blocks def mae_vit_b_p16_dec
# dec512d8b -> decoder: 512 dim, 8 blocks
def mae_vit_base_patch16_dec512d8b(**kwargs):
    """Build a ViT-Base/16 masked autoencoder.

    Encoder: patch 16, embed dim 768, depth 12, 12 attention heads.
    Decoder: embed dim 512, depth 8, 16 attention heads.

    Any keyword arguments (e.g. ``img_size``, ``patch_size`` overrides, as
    done in the ``__main__`` demo elsewhere in this file) are forwarded to
    ``MaskedAutoencoderViT``.

    Returns:
        MaskedAutoencoderViT: the configured model instance.
    """
    model = MaskedAutoencoderViT(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        decoder_embed_dim=512,
        decoder_depth=8,
        decoder_num_heads=16,
        mlp_ratio=4,
        # FIX: paddle.nn.LayerNorm's keyword is `epsilon`, not `eps` (the
        # sibling factory in this file already uses epsilon=1e-6); passing
        # `eps` would raise TypeError when the norm layer is constructed.
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        **kwargs)
    return model
(nn.LayerNorm, epsilon=1e-6), **kwargs) return model if __name__ == '__main__': m = mae_vit_b_p16_dec512d8b(img_size=32, patch_size=4) x = paddle.randn([1,3,32,32]) loss,pred,mask = m(x, mask_ratio=0.75) print('==> mae pretrain loss:', loss) print('==>...