```python
d))
V_mat = torch.rand((N, d))

# Standard PyTorch softmax and attention computation
expected_softmax = torch.softmax(Q_mat @ K_mat.T, dim=1)
expected_attention = expected_softmax @ V_mat

## Safe softmax and attention computation
# 1st read
S_mat = Q_mat @ K_mat.T
row_max = torch.max(S_mat, dim=1).values[:, None]
...
```
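Since the snippet above is cut off, here is a minimal self-contained sketch of the same comparison, with illustrative sizes for `N` and `d` chosen by me rather than taken from the original script. The safe-softmax branch subtracts the per-row maximum before exponentiating, which leaves the result mathematically unchanged but keeps `exp()` from overflowing:

```python
import torch

N, d = 16, 8                      # illustrative sizes (assumption, not from the original)
Q_mat = torch.rand((N, d))
K_mat = torch.rand((N, d))
V_mat = torch.rand((N, d))

# Standard PyTorch softmax attention (reference result)
expected_softmax = torch.softmax(Q_mat @ K_mat.T, dim=1)
expected_attention = expected_softmax @ V_mat

# Safe softmax: subtract the per-row max before exp() so the exponent never overflows
S_mat = Q_mat @ K_mat.T
row_max = torch.max(S_mat, dim=1).values[:, None]
exp_scores = torch.exp(S_mat - row_max)
safe_softmax = exp_scores / exp_scores.sum(dim=1, keepdim=True)
safe_attention = safe_softmax @ V_mat

assert torch.allclose(expected_attention, safe_attention, atol=1e-6)
```

The final `allclose` check confirms the two paths agree up to floating-point error.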
First, let's review the core computation logic of traditional FlashAttention:

```python
# acc_s: [block_M, block_N]
# scores_max: [block_M]
# scores_scale: [block_M]
# acc_o: [block_M, dim]
for i in range(loop_range):
    acc_s = Q @ K[i]
    scores_max_prev = scores_max
    scores_max = max(acc_s, dim=1)
    scores_scale = exp(scores_max_prev - scores_max)
    ...
```
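The loop above is cut off, so as a point of reference here is a hedged, plain-PyTorch sketch of the same online-softmax recurrence; the tile size `block_N`, the tensor names, and the final normalization step are my assumptions, not code from the original kernel:

```python
import torch

def flash_attention_forward(Q, K, V, block_N=64):
    """Sketch of the FlashAttention online-softmax loop for one block of query rows.

    Q: [block_M, dim], K/V: [seq_len, dim]. Real kernels keep the tiles on-chip,
    but the rescaling recurrence is the same.
    """
    block_M, dim = Q.shape
    acc_o = torch.zeros(block_M, dim)                    # unnormalized output accumulator
    scores_max = torch.full((block_M,), float("-inf"))   # running row-wise max
    logsum = torch.zeros(block_M)                        # running softmax denominator

    for start in range(0, K.shape[0], block_N):
        K_i = K[start:start + block_N]
        V_i = V[start:start + block_N]

        acc_s = Q @ K_i.T                                            # [block_M, block_N]
        scores_max_prev = scores_max
        scores_max = torch.maximum(scores_max_prev, acc_s.max(dim=1).values)
        scores_scale = torch.exp(scores_max_prev - scores_max)       # rescale old statistics

        p = torch.exp(acc_s - scores_max[:, None])                   # block-local exponentials
        logsum = logsum * scores_scale + p.sum(dim=1)
        acc_o = acc_o * scores_scale[:, None] + p @ V_i

    return acc_o / logsum[:, None]                                   # final normalization
```

The key invariant is that `acc_o` and `logsum` always hold values consistent with the current `scores_max`; whenever a new tile raises the running maximum, `scores_scale` rescales both before the tile's contribution is added, so the result matches `torch.softmax(Q @ K.T, dim=1) @ V` up to floating-point error.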