```python
# Heuristic: pick the number of parallel reduction streams for dw/db based on N
GROUP_SIZE_M = 64
if N <= 8192: GROUP_SIZE_M = 96
if N <= 4096: GROUP_SIZE_M = 128
if N <= 1024: GROUP_SIZE_M = 256
# allocate output
locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device=w.device)
_dw = torch.zeros((GROUP_SIZE_M, N), dtype=x.dtype, device=w.device)
...
```
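These `GROUP_SIZE_M`-row partial buffers exist so that many rows can accumulate weight/bias gradients concurrently: each row writes into one of `GROUP_SIZE_M` partial rows (guarded by the `locks` buffer), and a second pass then reduces them. Conceptually, that final reduction is equivalent to the sketch below; the actual tutorial-style code does it in a separate Triton kernel, and `_db` is assumed here to be allocated the same way as `_dw`.

```python
# Conceptual equivalent of the final reduction over the GROUP_SIZE_M partial buffers
# (a sketch only; the real code performs this step inside a dedicated Triton kernel).
dw = _dw.sum(dim=0).to(w.dtype)   # collapse the GROUP_SIZE_M partial rows into dw
db = _db.sum(dim=0).to(w.dtype)   # _db is assumed to mirror _dw's allocation
```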
```python
first_pid_m = group_id * GROUP_SIZE_M
# If `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
# *Within groups*, programs are ordered in a column-major order
...
```
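For reference, these lines sit inside the standard grouped-ordering preamble of a tutorial-style matmul kernel. A sketch of the full mapping from the flat `pid` to the tile coordinates `(pid_m, pid_n)` looks roughly like this (variable names as in the surrounding kernel):

```python
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)            # number of row-blocks of C
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)            # number of column-blocks of C
num_pid_in_group = GROUP_SIZE_M * num_pid_n     # programs covering one group of row-blocks
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
# Column-major order within the group keeps GROUP_SIZE_M row-blocks of A resident
# in cache while the columns of B stream through.
pid_m = first_pid_m + (pid % num_pid_in_group) % group_size_m
pid_n = (pid % num_pid_in_group) // group_size_m
```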
```python
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 6...
```
```python
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 1...
```
In the earlier example A(9, 9) @ B(9, 9) = C(9, 9), BLOCK_SIZE_M, BLOCK_SIZE_N, and BLOCK_SIZE_K were all set to 3, and GROUP_SIZE_M was in fact set to 1, which is too special a case to show the grouping. So let's take an example whose numbers make the grouping easier to see: A(2048, 2048) @ B(2048, 2048) = C(2048, 2048), where, because we assume in advance that ...
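To make the grouping concrete, here is a small host-side sketch that walks the same mapping for the 2048 × 2048 case. The block sizes and GROUP_SIZE_M below are illustrative assumptions, not values taken from the text above:

```python
# Host-side sketch of the grouped ordering for the 2048 x 2048 case, assuming
# (hypothetically) BLOCK_SIZE_M = BLOCK_SIZE_N = 128 and GROUP_SIZE_M = 8.
M = N = 2048
BLOCK_SIZE_M = BLOCK_SIZE_N = 128    # assumed tile sizes
GROUP_SIZE_M = 8                     # assumed group size

num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M   # 16 row-blocks
num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N   # 16 column-blocks
num_pid_in_group = GROUP_SIZE_M * num_pid_n          # 128 programs per group

def swizzle(pid):
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % num_pid_in_group) % group_size_m
    pid_n = (pid % num_pid_in_group) // group_size_m
    return pid_m, pid_n

# The first GROUP_SIZE_M * num_pid_n programs all stay inside the first 8 row-blocks,
# walking them column by column, which is what improves data reuse in the cache.
print([swizzle(pid) for pid in range(10)])
# [(0, 0), (1, 0), ..., (7, 0), (0, 1), (1, 1)]
```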
```python
                              GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
    # Map the program id to the elements of X, DX, and DY it should compute.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N
    ...
```
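Further down in this same backward kernel, GROUP_SIZE_M is what ties a row to one of the partial dw/db buffers and to its spin lock. A rough sketch of that accumulation step, reconstructed from the tutorial-style layer-norm backward (exact details may differ):

```python
# Sketch of the per-row accumulation into the shared partial buffers.
lock_id = row % GROUP_SIZE_M            # which partial buffer / lock this row uses
Lock += lock_id
Count = Lock + GROUP_SIZE_M             # second half of `locks` counts first writers
DW = DW + lock_id * N + cols
DB = DB + lock_id * N + cols
while tl.atomic_cas(Lock, 0, 1) == 1:   # spin until the lock is acquired
    pass
count = tl.load(Count)
if count == 0:
    tl.atomic_xchg(Count, 1)            # first writer overwrites instead of accumulating
else:
    partial_dw += tl.load(DW, mask=mask)
    partial_db += tl.load(DB, mask=mask)
tl.store(DW, partial_dw, mask=mask)
tl.store(DB, partial_db, mask=mask)
tl.atomic_xchg(Lock, 0)                 # release the lock
```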
```python
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, num_warps=4, num_stages=0),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 4, 'waves_per_eu': 2}...
```
```python
import torch

import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({
            'BLOCK_SIZE_M': 128,
            'BLOCK_SIZE_N': 128,
            'BLOCK_SIZE_K': 32,
            'NUM_SM'...
```
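The NUM_SM meta-parameter hints at the persistent-kernel style used by the grouped-GEMM example: only NUM_SM programs are launched, and each one loops over output tiles with a stride of NUM_SM. A minimal plain-Python sketch of that scheduling idea (values below are illustrative, not from the code above):

```python
# Plain-Python sketch of persistent-kernel tile scheduling: NUM_SM resident programs
# stride through all tiles instead of launching one program per tile.
NUM_SM = 4           # illustrative value
total_tiles = 10     # illustrative value

for pid in range(NUM_SM):
    tiles = list(range(pid, total_tiles, NUM_SM))
    print(f"program {pid} processes tiles {tiles}")
```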
```python
    triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
    # To try more candidates, add further triton.Config entries here
]

# Get the CUDA autotuning configs
def get_autotune_config():
    ...
```
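In the upstream tutorial this function simply dispatches between the CUDA and HIP config lists. A sketch of that pattern, where the helper names `is_cuda`, `get_cuda_autotune_config`, and `get_hip_autotune_config` are assumed and the driver API may vary across Triton versions:

```python
import triton

def is_cuda():
    # True when the active Triton backend targets NVIDIA GPUs (API may differ by version).
    return triton.runtime.driver.active.get_current_target().backend == "cuda"

def get_autotune_config():
    # Pick the config list matching the current backend; the two helpers are assumed
    # to return the CUDA and HIP config lists shown above.
    return get_cuda_autotune_config() if is_cuda() else get_hip_autotune_config()
```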
```python
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
    ],
    key=['M', 'N', 'K'],
)
```

Then, by calling Triton's do_bench, you can actually run the operator you just wrote. do_bench lives under python/triton/testing.py and, for each kernel, it performs 25...
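A small usage sketch of do_bench; `matmul` here is assumed to be the Triton-backed wrapper defined alongside the kernel above:

```python
import torch
import triton

# Benchmark the (assumed) Triton-backed matmul wrapper with do_bench, which handles
# warmup and repetition internally and returns the measured time in milliseconds.
a = torch.randn((2048, 2048), device='cuda', dtype=torch.float16)
b = torch.randn((2048, 2048), device='cuda', dtype=torch.float16)

ms = triton.testing.do_bench(lambda: matmul(a, b))
print(f"triton matmul: {ms:.3f} ms")
```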