cudaMemcpyAsync(dev_a0, host_a + i, N*sizeof(int),cudaMemcpyHostToDevice, stream0); cudaMemcpyAsync(dev_a1, host_a + i + N, N*sizeof(int),cudaMemcpyHostToDevice, stream1); cudamemcpyAsync(dev_b0, host_b + i, N*sizeof(int),cudaMemcpyHostToDevice, stream0); cudamemcpyAsync(dev_b1...