[ACCEPTED]-Thrust inside user written kernels-thrust

Accepted answer
Score: 54

As it was originally written, Thrust is purely a host side abstraction. It cannot be used inside kernels. You can pass the device memory encapsulated inside a thrust::device_vector to your own kernel like this:

thrust::device_vector< Foo > fooVector;
// Do something thrust-y with fooVector

// Obtain the raw device pointer backing the vector's storage; the
// device_vector object itself cannot be passed across the kernel boundary.
Foo* fooArray = thrust::raw_pointer_cast( fooVector.data() );

// Pass raw array and its size to kernel
someKernelCall<<< x, y >>>( fooArray, fooVector.size() );

and you can also use device memory not allocated by thrust within thrust algorithms by instantiating a thrust::device_ptr with the bare cuda device memory pointer.

Edited four and a half years later to add that, as per @JackOLantern's answer, thrust 1.8 adds a sequential execution policy, which means you can run single threaded versions of thrust's algorithms on the device. Note that it still isn't possible to directly pass a thrust device vector to a kernel, and device vectors can't be directly used in device code.

Note that it is also possible to use the thrust::device execution policy in some cases to have parallel thrust execution launched by a kernel as a child grid. This requires separate compilation/device linkage and hardware which supports dynamic parallelism. I am not certain whether this is actually supported in all thrust algorithms or not, but it certainly works with some.

Score: 16

This is an update to my previous answer.

Starting from Thrust 1.8.1, CUDA Thrust primitives can be combined with the thrust::device execution policy to run in parallel within a single CUDA thread, exploiting CUDA dynamic parallelism. Below, an example is reported.

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define BLOCKSIZE_1D    256
#define BLOCKSIZE_2D_X  32
#define BLOCKSIZE_2D_Y  32

/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/
// Row-wise reduction, one thread per row: each thread sums its row of the
// Nrows x Ncols row-major matrix d_data with thrust::reduce under the
// thrust::seq execution policy, i.e. sequentially within the calling thread.
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= Nrows) return;   // guard the grid tail

    const float *rowBegin = d_data + row * Ncols;
    d_results[row] = thrust::reduce(thrust::seq, rowBegin, rowBegin + Ncols);
}

// Same row-wise reduction as test1, but with the thrust::device execution
// policy: per the surrounding text, Thrust may then run the reduction as a
// parallel child grid (requires device linkage and dynamic-parallelism
// capable hardware).
__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= Nrows) return;   // guard the grid tail

    const float *first = d_data + row * Ncols;
    const float *last  = first + Ncols;
    d_results[row] = thrust::reduce(thrust::device, first, last);
}

/********/
/* MAIN */
/********/
// Host driver: fills an Nrows x Ncols matrix whose i-th row holds Ncols
// copies of the value i, computes reference row sums on the CPU, then runs
// and times the two device-side reduction kernels (thrust::seq vs
// thrust::device) and checks each result against the reference.
// Returns 0 on success, 1 on a result mismatch.
int main() {

    const int Nrows = 64;
    const int Ncols = 2048;

    // Touch the runtime so context creation is not charged to the timers.
    gpuErrchk(cudaFree(0));

//    size_t DevQueue;
//    gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
//    DevQueue *= 128;
//    gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

    float *h_data       = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *h_results    = (float *)malloc(Nrows *         sizeof(float));
    float *h_results1   = (float *)malloc(Nrows *         sizeof(float));
    float *h_results2   = (float *)malloc(Nrows *         sizeof(float));

    // CPU reference: row i sums to i * Ncols.
    for (int i=0; i<Nrows; i++) {
        h_results[i] = 0.f;
        for (int j=0; j<Ncols; j++) {
            h_data[i*Ncols+j] = i;
            h_results[i] = h_results[i] + h_data[i*Ncols+j];
        }
    }

    TimingGPU timerGPU;

    float *d_data;          gpuErrchk(cudaMalloc((void**)&d_data,     Nrows * Ncols * sizeof(float)));
    float *d_results1;      gpuErrchk(cudaMalloc((void**)&d_results1, Nrows         * sizeof(float)));
    float *d_results2;      gpuErrchk(cudaMalloc((void**)&d_results2, Nrows         * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    // --- Approach nr. 1: thrust::seq inside the kernel ---
    timerGPU.StartCounter();
    test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i=0; i<Nrows; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            return 1;   // signal failure to the caller (was: return 0)
        }
    }

    // --- Approach nr. 2: thrust::device inside the kernel ---
    // Bug fix: the original launched test2 with d_results1 and verified
    // h_results1, leaving d_results2 / h_results2 allocated but never used.
    timerGPU.StartCounter();
    test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results2, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results2, d_results2, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i=0; i<Nrows; i++) {
        if (h_results2[i] != h_results[i]) {
            printf("Approach nr. 2; Error at i = %i; h_results2 = %f; h_results = %f", i, h_results2[i], h_results[i]);
            return 1;
        }
    }

    printf("Test passed!\n");

    // Release device and host allocations.
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_results1));
    gpuErrchk(cudaFree(d_results2));
    free(h_data);
    free(h_results);
    free(h_results1);
    free(h_results2);

    return 0;
}

The above example performs reductions of the rows of a matrix in the same sense as "Reduce matrix rows with CUDA", but it is done differently from the above post, namely, by calling CUDA Thrust primitives directly from user written kernels. Also, the above example serves to compare the performance of the same operations when done with two execution policies, namely, thrust::seq and thrust::device. Below, some graphs showing the difference in performance.

Timings

Speedups

The performance has been evaluated on a Kepler K20c and on a Maxwell GeForce GTX 850M.

Score: 14

I would like to provide an updated answer to this question.

Starting from Thrust 1.8, CUDA Thrust primitives can be combined with the thrust::seq execution policy to run sequentially within a single CUDA thread (or sequentially within a single CPU thread). Below, an example is reported.

If you want parallel execution within a thread, then you may consider using CUB, which provides reduction routines that can be called from within a threadblock, provided that your card enables dynamic parallelism.

Here is the example with Thrust:

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps every CUDA runtime call; on failure, prints the error string and the
// source location, then (optionally) aborts with the error code.
// Fix: __FILE__ expands to a string literal, and binding a string literal to
// a non-const char* is ill-formed in standard C++ — the parameter must be
// const char *.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Demo kernel (intended for a <<<1,1>>> launch): sequentially reduces
// d_A[0..N) with thrust::reduce under the thrust::seq policy and prints the
// result from device code.
__global__ void test(float *d_A, int N) {

    const float total = thrust::reduce(thrust::seq, d_A, d_A + N);

    printf("Device side result = %f\n", total);
}

// Host driver: computes the sum 0 + 1 + ... + N-1 on the CPU, copies the
// data to the device, and launches a single-thread kernel that computes the
// same sum with thrust::reduce(thrust::seq, ...) and prints it.
int main() {

    const int N = 16;

    float *h_A = (float*)malloc(N * sizeof(float));
    float sum = 0.f;
    for (int i=0; i<N; i++) {
        h_A[i] = i;
        sum = sum + h_A[i];
    }
    printf("Host side result = %f\n", sum);

    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

    test<<<1,1>>>(d_A, N);
    gpuErrchk(cudaPeekAtLastError());     // catch launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());   // wait for the kernel; also ensures the
                                          // device-side printf output is flushed

    // Release resources (the original leaked both allocations).
    gpuErrchk(cudaFree(d_A));
    free(h_A);

    return 0;
}
Score: 6

If you mean to use the data allocated / processed by thrust, yes you can — just get the raw pointer of the allocated data.

// Extract the bare device pointer wrapped by dev_ptr so it can be handed to
// plain CUDA code (dev_ptr is presumably a thrust::device_ptr<int> — confirm
// against the surrounding allocation).
int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);

If you want to allocate thrust vectors inside the kernel, I never tried it, but I don't think it will work — and even if it does, I don't think it would provide any benefit.

More Related questions