Making linalg.matmul into GPU-runnable code

So, I was trying out this MLIR code:

module {
    func @matmul_linalg(%A: memref<8x8xf32>, %B: memref<8x8xf32>, %C: memref<8x8xf32>) {
        linalg.matmul ins(%A, %B : memref<8x8xf32>, memref<8x8xf32>)
            outs(%C: memref<8x8xf32>)
        return
    }

    func @main() {
        %A = memref.alloc() : memref<8x8xf32>
        %B = memref.alloc() : memref<8x8xf32>
        %C = memref.alloc() : memref<8x8xf32>
        
        %cf1 = constant 1.0 : f32
        
        linalg.fill(%A, %cf1) : memref<8x8xf32>, f32
        linalg.fill(%B, %cf1) : memref<8x8xf32>, f32
        linalg.fill(%C, %cf1) : memref<8x8xf32>, f32
        
        call @matmul_linalg(%A, %B, %C) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> ()
        return
    }
}

and this is my pass:

mlir-opt matmul-gpu-02.mlir.in \
    --linalg-tile-to-parallel-loops="linalg-tile-sizes=4,2" \
    --convert-linalg-to-parallel-loops \
    --test-gpu-greedy-parallel-loop-mapping \
    --convert-parallel-loops-to-gpu \
    --gpu-kernel-outlining \
    --lower-affine \
    --convert-scf-to-std \
    --canonicalize \
    --pass-pipeline="gpu.module(strip-debuginfo, convert-gpu-to-nvvm, gpu-to-cubin)" \
    --gpu-to-llvm 2>&1 >matmul-gpu-02.mlir.out

and this is how I’m generating the object:

mlir-translate matmul-gpu-02.mlir.out --mlir-to-llvmir | opt -O3 -S | llc -O3 | as - -o matmul-gpu-02.mlir.o

I didn’t get any complaints up to this point, but when I tried to generate the executable –

clang++-11 matmul-gpu-02.mlir.o -lcuda \
    $HOME/opt/llvm/lib/libmlir_cuda_runtime.so \
    $HOME/opt/llvm/lib/libmlir_runner_utils.so \
    -o matmul-gpu-02

I get these errors –

'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'

Any idea?

I have the same problem.
Is there any update?

Just some info:

  • llvm-project repo commit id: 638dcea010cfc280f428d0cc13f4aa8578a1d69d.
  • OS: CUDA 11.3 container with nvidia-docker on WSL2.

The problem is that the allocations are not accessible from the GPU by default. If you add code to register them with the GPU runtime, it should work.
This code should work:

module {
    func @matmul_linalg(%A: memref<8x8xf32>, %B: memref<8x8xf32>, %C: memref<8x8xf32>) {
        linalg.matmul ins(%A, %B : memref<8x8xf32>, memref<8x8xf32>)
            outs(%C: memref<8x8xf32>)
        return
    }

    func @main() {
        %A = memref.alloc() : memref<8x8xf32>
        %B = memref.alloc() : memref<8x8xf32>
        %C = memref.alloc() : memref<8x8xf32>

        %cf1 = constant 1.0 : f32

        %AC = memref.cast %A : memref<8x8xf32> to memref<*xf32>
        %BC = memref.cast %B : memref<8x8xf32> to memref<*xf32>
        %CC = memref.cast %C : memref<8x8xf32> to memref<*xf32>

        gpu.host_register %AC : memref<*xf32>
        gpu.host_register %BC : memref<*xf32>
        gpu.host_register %CC : memref<*xf32>

        linalg.fill(%cf1, %A) : f32, memref<8x8xf32>
        linalg.fill(%cf1, %B) : f32, memref<8x8xf32>
        linalg.fill(%cf1, %C) : f32, memref<8x8xf32>

        call @matmul_linalg(%A, %B, %C) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> ()
        return
    }
}
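
For background, gpu.host_register ends up calling into libmlir_cuda_runtime, which pins the host allocation so the device can access it through the same pointer. Conceptually, the driver-level effect is roughly the following (a hand-written sketch using the CUDA driver API, not the actual mlir_cuda_runtime wrapper code):

// Sketch of what registering a host buffer for device access looks like
// at the CUDA driver API level. Illustrative only; the real runtime
// wrappers differ in detail.
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                                  \
  do {                                                               \
    CUresult err_ = (call);                                          \
    if (err_ != CUDA_SUCCESS) {                                      \
      fprintf(stderr, "'%s' failed with error %d\n", #call, err_);   \
      exit(1);                                                       \
    }                                                                \
  } while (0)

int main(void) {
  CHECK(cuInit(0));
  CUdevice dev;
  CHECK(cuDeviceGet(&dev, 0));
  CUcontext ctx;
  CHECK(cuCtxCreate(&ctx, 0, dev));

  // Plain pageable host allocation, like memref.alloc() in the MLIR above.
  size_t bytes = 8 * 8 * sizeof(float);
  float *A = malloc(bytes);

  // Pin the pages and map them into the device address space so a kernel
  // can dereference the pointer. Without this step the kernel touches
  // unmapped memory, which is what CUDA_ERROR_ILLEGAL_ADDRESS reports.
  CHECK(cuMemHostRegister(A, bytes, CU_MEMHOSTREGISTER_DEVICEMAP));

  /* ... launch kernels that read/write A ... */

  CHECK(cuMemHostUnregister(A));
  free(A);
  CHECK(cuCtxDestroy(ctx));
  return 0;
}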

@ThomasRaoux
Thanks, but after fixing my code like yours I still found errors similar to the ones first reported.
It seems like some kind of system problem, doesn’t it?
I’ve put my build commands below.

# I saved the previous program as matmul.mlir
# optimization to matmul.opt.mlir
mlir-opt matmul.mlir \
    --linalg-tile-to-parallel-loops="linalg-tile-sizes=4,2" \
    --convert-linalg-to-parallel-loops \
    --test-gpu-greedy-parallel-loop-mapping \
    --convert-parallel-loops-to-gpu \
    --gpu-kernel-outlining \
    --lower-affine \
    --convert-scf-to-std \
    --canonicalize \
    --pass-pipeline="gpu.module(strip-debuginfo, convert-gpu-to-nvvm, gpu-to-cubin)" \
    --gpu-to-llvm > matmul.opt.mlir
# build executable 'matmul.elf' including main()
mlir-translate matmul.opt.mlir --mlir-to-llvmir | opt -O3 -S | llc -O3 | \
    clang -x assembler - -o matmul.elf -L llvm/lib -l mlir_cuda_runtime -l mlir_runner_utils
# run the executable
./matmul.elf
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleGetFunction(&function, module, name)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_INVALID_HANDLE'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_INVALID_HANDLE'

I just tried mlir-cpu-runner, but it returned the same errors.

mlir-cpu-runner matmul.opt.mlir \
    --shared-libs=llvm/lib/libmlir_cuda_runtime.so \
    --shared-libs=llvm/lib/libmlir_runner_utils.so \
    --entry-point-result=void

This could be the same issue as discussed in a recently reported bug. Older CUDA architectures require manual translation of pointers. Could this be the case here?
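
For reference, on such older devices (no unified virtual addressing), a registered host pointer is not directly valid in device code; the device-side alias has to be fetched explicitly. A minimal sketch of that manual translation, assuming the buffer was already registered with CU_MEMHOSTREGISTER_DEVICEMAP as above:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// On GPUs without unified addressing, a kernel must receive the
// device-side alias of a registered host buffer, not the host pointer.
static CUdeviceptr device_alias(void *host_ptr) {
  CUdeviceptr dev_ptr;
  CUresult err = cuMemHostGetDevicePointer(&dev_ptr, host_ptr, /*Flags=*/0);
  if (err != CUDA_SUCCESS) {
    fprintf(stderr, "cuMemHostGetDevicePointer failed with error %d\n", err);
    exit(1);
  }
  return dev_ptr;  // pass this as the kernel argument
}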

Use mlir-cpu-runner to run your mlir.out:

mlir-cpu-runner matmul-gpu-02.mlir.out \
    --shared-libs=$HOME/opt/llvm/lib/libmlir_cuda_runtime.so \
    --shared-libs=$HOME/opt/llvm/lib/libmlir_runner_utils.so \
    --entry-point-result=void

In my case, manually registering the memrefs accessed by the GPU via gpu.host_register worked. Hope this helps.