triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 6...
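# A minimal sketch of how a config list like the one above is typically wired into
# @triton.autotune: the autotuner benchmarks each candidate and caches the best one
# per (M, N, K). The kernel name `matmul_kernel`, its argument list, and the key are
# assumptions following the Triton matmul tutorial, not code taken from this page.
import triton
import triton.language as tl

configs = [
    triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
                  num_stages=4, num_warps=4),
    triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
                  num_stages=4, num_warps=4),
]

@triton.autotune(configs=configs, key=['M', 'N', 'K'])
@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                  BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
                  BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
    pid = tl.program_id(axis=0)  # ... rest of the tiled matmul body goes here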
    GROUP_SIZE_M = 64
    if N <= 8192: GROUP_SIZE_M = 96
    if N <= 4096: GROUP_SIZE_M = 128
    if N <= 1024: GROUP_SIZE_M = 256
    # allocate output
    locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device=w.device)
    _dw = torch.zeros((GROUP_SIZE_M, N), dty...
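# A minimal sketch (plain PyTorch, not the tutorial's kernel) of what the buffers
# above are for: `_dw` / `_db` hold one partial sum per reduction stream, `locks`
# lets concurrent programs serialize writes into the same partial row, and a final
# reduction collapses the GROUP_SIZE_M partials into dw / db. Sizes below are
# arbitrary example values.
import torch

GROUP_SIZE_M, N = 128, 4096
_dw = torch.randn(GROUP_SIZE_M, N)   # stands in for the accumulated partial sums
_db = torch.randn(GROUP_SIZE_M, N)
dw = _dw.sum(dim=0)                  # shape (N,); the tutorial's `_layer_norm_bwd_dwdb`
db = _db.sum(dim=0)                  # kernel performs this reduction on the GPU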
        _dw, _db, dw, db, min(GROUP_SIZE_M, M), N,  #
        BLOCK_SIZE_M=32,  #
        BLOCK_SIZE_N=128, num_ctas=1)
    return dx, None, dw, db, None

layer_norm = LayerNorm.apply

def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
    # create data
    x_shape = (M, N)
    w_shape = (x_shape...
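# A usage sketch for the test above, assuming the forward signature from the Triton
# layer-norm tutorial, layer_norm(x, normalized_shape, weight, bias, eps); it compares
# the Triton op against PyTorch's reference layer norm. Shapes and tolerances are
# illustrative example values.
import torch

M, N, dtype, eps = 1151, 8192, torch.float16, 1e-5
x = -2.3 + 0.5 * torch.randn((M, N), dtype=dtype, device='cuda')
weight = torch.rand((N, ), dtype=dtype, device='cuda', requires_grad=True)
bias = torch.rand((N, ), dtype=dtype, device='cuda', requires_grad=True)

y_tri = layer_norm(x, (N, ), weight, bias, eps)
y_ref = torch.nn.functional.layer_norm(x, (N, ), weight, bias, eps)
assert torch.allclose(y_tri, y_ref, atol=1e-2, rtol=0)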
    first_pid_m = group_id * GROUP_SIZE_M
    # If `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    # *Within groups*, progra...
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    # Number of program ids along the N axis
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    # Number of programs in group
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    # Id of the group this program is in ...
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction...
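# A host-side sketch (plain Python, not a kernel) of the grouped ordering above: it
# reproduces the pid -> (pid_m, pid_n) swizzle so the launch order can be printed and
# inspected. The num_pid_m / num_pid_n / GROUP_SIZE_M values are arbitrary examples.
def grouped_order(num_pid_m, num_pid_n, GROUP_SIZE_M):
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    order = []
    for pid in range(num_pid_m * num_pid_n):
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (pid % group_size_m)
        pid_n = (pid % num_pid_in_group) // group_size_m
        order.append((pid_m, pid_n))
    return order

# e.g. grouped_order(4, 4, 2) walks the C tiles two block-rows at a time, which is
# what improves L2 reuse of the A tiles compared with plain row-major ordering.
print(grouped_order(4, 4, 2))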
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, num_warps=4, num_stages=0),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 4, 'waves_per_eu': 2}...
        BLOCK_SIZE_N: tl.constexpr,  #
        BLOCK_SIZE_K: tl.constexpr,  #
        GROUP_SIZE_M: tl.constexpr,  #
):
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
                             GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
    # Map the program id to the elements of X, DX, and DY it should compute.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N
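# A launch sketch for a row-per-program kernel like the one above: each program handles
# one row of X / DX / DY, so the grid is simply (M,) and BLOCK_SIZE_N must cover the row
# width (the `mask = cols < N` above handles the tail). The kernel name and its full
# argument list are assumed from the Triton layer-norm tutorial and elided here.
import triton

M, N = 1151, 8192
BLOCK_SIZE_N = triton.next_power_of_2(N)   # one block spans the whole row
grid = (M, )                               # one Triton program per row
# _layer_norm_bwd_dx_fused[grid](..., N, GROUP_SIZE_M=GROUP_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N)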
    group_a_ptrs,
    group_b_ptrs,
    group_c_ptrs,
    # device tensor of gemm (General Matrix Multiply) sizes. its shape is [group_size, 3]
    # dim 0 is group_size, dim 1 is the values of <M, N, K> of each gemm
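# A host-side sketch of how such per-group pointer / size tensors can be built (the
# variable names are illustrative, not necessarily the tutorial's): each GEMM's
# data_ptr() and its (M, N, K) triple are packed into device tensors that the kernel
# then indexes by group id.
import torch

group_A = [torch.randn(128, 64, device='cuda') for _ in range(4)]
group_B = [torch.randn(64, 256, device='cuda') for _ in range(4)]
group_C = [torch.empty(128, 256, device='cuda') for _ in range(4)]

a_ptrs, b_ptrs, c_ptrs, sizes = [], [], [], []
for a, b, c in zip(group_A, group_B, group_C):
    M, K = a.shape
    N = b.shape[1]
    a_ptrs.append(a.data_ptr())
    b_ptrs.append(b.data_ptr())
    c_ptrs.append(c.data_ptr())
    sizes += [M, N, K]                      # three values per gemm, flattened

group_a_ptrs = torch.tensor(a_ptrs, dtype=torch.int64, device='cuda')
group_b_ptrs = torch.tensor(b_ptrs, dtype=torch.int64, device='cuda')
group_c_ptrs = torch.tensor(c_ptrs, dtype=torch.int64, device='cuda')
group_gemm_sizes = torch.tensor(sizes, dtype=torch.int32, device='cuda').view(-1, 3)  # [group_size, 3]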