Ctypes Cuda – pointer multiplication does not result in product

I implemented a Cuda matrix multiplication solely in C which successfully runs. Now I am trying to shift the Matrix initialization to numpy and use Python’s ctypes library to execute the c code. It seems like the array with the pointer does not contain the multiplied values. I am not quite sure where the problem lies, but already in the CUDA code – even after calling the Kernel method and loading back the values from device to host, values are still zeroes.

The CUDA code:

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
#define RANDOM_MN_RANGE 64

struct Matrix {
    int width;
    int height;
    // contiguously stored Matrix, in row first order
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

extern "C" {
    void mMul( Matrix *A, Matrix *B, Matrix *C ){

        Matrix d_A, d_B, d_C;

        // Matrix d_A
        d_A.width    =   A->width;
        d_A.height   =   A->height;
        size_t sizeA =   A->width * A->height * sizeof(float);
        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_A.elements, sizeA);
        cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

        // Matrix d_B
        d_B.width    =   B->width;
        d_B.height   =   B->height;
        size_t sizeB =   B->width * B->height * sizeof(float);
        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_B.elements, sizeB);
        cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

        // Matrix d_C
        d_C.width    =   C->width;
        d_C.height   =   C->height;
        size_t sizeC =   C->width * C->height * sizeof(float);

        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_C.elements, sizeC);

        // 16 * 16 = 256 threads per block
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

        // Blocks per grid
        dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

        // calling the Kernel
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

        // copy results from result matrix C to the host again
        cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

        // C->elements[0] contains still 0, the values do not seem to be loaded back to host memory.
        printf("A is %fn", A->elements[0]);
        printf("B is %fn", B->elements[0]);
        printf("C is %fn", C->elements[0]);

        // free the cuda memory
        cudaFree(d_A.elements);
        cudaFree(d_B.elements);
        cudaFree(d_C.elements);

    }
}

To compile the code I use the following command

nvcc  --shared --compiler-options '-fPIC' -o Sequential_Cuda_Python.so Sequential_Cuda_Python.cu

ctypes Python code

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *


class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL("./Sequential_Cuda_Python.so")

libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None


def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
    """
    numpy arr to Matrix pointer
    @return (pointer to arr, shape)
    """
    c_float_p = POINTER(c_float)
    data = data.astype(np.float32)

    w, h = np.shape(data)
    # print((w, h))

    mXp = Matrix(height=h, width=w, elements=data.ctypes.data_as(c_float_p))
    return (pointer(mXp), np.shape(data))


def matMulSeqCuda( _mA, _mB, _mC ):
    """
    multiplies mA with mB sequentually using mC
    """

    pmA, mASz = ( _mA[0], _mA[-1] )
    pmB, mBSz = ( _mB[0], _mB[-1] )
    pmC, mCSz = ( _mC[0], _mC[-1] )

    assert np.shape( mASz )[0] == 2 and 
           np.shape( mBSz )[0] == 2 and 
           np.shape( mCSz )[0] == 2, "Only 2D arrays accepted"

    #assert np.shape(mA)[0] == np.shape(mB)[1], "Rows of mA need to be the same as Cols of mB"


    libc.mMul( pmA, pmB, pmC )
    c = pmC.contents

    mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))



    # The array still contains only 0. values
    print(mtxC)

    return 0


if __name__ == '__main__':

    a = np.ones( (16, 8) )
    b = np.ones( (16, 8) )
    c = np.zeros( (16, 16) )

    mA = npArrtoMatrixPtr(a)
    mB = npArrtoMatrixPtr(b)
    mC = npArrtoMatrixPtr(c)

    matMulSeqCuda(mA, mB, mC)

Solution:

As @Mark Tolonen pointed out the error lied in the Python script. By calling npArrtoMatrixPptr(creates and returns a Pointer to a Matrix struct) within the same scope as the CUDA function libc.mMul I was able to retrieve the correct resulting Matrix mtxC.

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *


class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]
libc =  CDLL("./Sequential_Cuda_Python.so")

libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None

def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
    """
    numpy arr to Matrix pointer
    @return (pointer to arr, shape)
    """
    #c_float_p = POINTER(c_float)
    data = data.astype(np.float32)

    h, w = data.shape
    mXp = Matrix(w, h, data.ctypes.data_as(POINTER(c_float)))
    return (pointer(mXp), np.shape(data))


def matMulSeqCuda( npMa, npMb, npMc ):
    """
    multiplies mA with mB sequentually using mC
    """
    assert len(np.shape( npMa )) == 2 and 
           len(np.shape( npMb )) == 2 and 
           len(np.shape( npMc )) == 2, "Only 2D arrays accepted"

    pmA, szA = npArrtoMatrixPtr(npMa)
    pmB, szB = npArrtoMatrixPtr(npMb.T)
    pmC, szC = npArrtoMatrixPtr(npMc) # the resulting array

    libc.mMul( pmA, pmB, pmC )

    c = pmC.contents
    mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.width))

    # the result is correct
    print(mtxC)

    return 0


if __name__ == '__main__':

    a = np.ones( (16, 8) )
    b = np.ones( (16, 8) )
    c = np.zeros( (16, 16) )

    matMulSeqCuda(a, b, c)

Answer

I can’t compile your code as is, but the problem is that np.shape returns (rows,columns) or the equivalent (height,width), not (width,height):

w, h = np.shape(data) # should be h,w

And also on visual inspection this line is wrong (c.height twice).

mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))

The CUDA code is extraneous to the question, which is passing a Matrix* correctly and receiving back the modifications. I made a minimal reproducible example below that concentrates on passing a Matrix correctly:

test.cpp – receives a Matrix structure and doubles the values in it.

#ifdef _WIN32
#   define API __declspec(dllexport)
#else
#   define API
#endif

struct Matrix {
    int width;
    int height;
    float *elements;
};

extern "C" API
void doubleit(Matrix *A) {
    for(int r = 0; r < A->height; ++r)
        for(int c = 0; c < A->width; ++c) {
            A->elements[r * A->width + c] *= 2;
    }
}

test.py

import numpy as np
from ctypes import *

class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL('./test')

libc.doubleit.argtypes = POINTER(Matrix),
libc.doubleit.restype = None

def doubleit(a):
    h,w = a.shape # note h,w not w,h
    m = Matrix(w,h,a.ctypes.data_as(POINTER(c_float)))
    libc.doubleit(m)

a = np.arange(0.0,0.6,0.1,dtype=np.float32).reshape((2,3))
print(a)
doubleit(a)
print(a)

Output:

[[0.  0.1 0.2]
 [0.3 0.4 0.5]]
[[0.  0.2 0.4]
 [0.6 0.8 1. ]]