gpgpu - determing the limit of size of the array when writing CUDA kernel for multi-gpu using Thrust library -


i trying write cuda kernel use multi-gpu , thrust library features. used tips previous posts.i tried write simple addition kernel. obvious intention use more complicated kernels.

my code follows:

#include "test.h" int main(int argc, char *argv[]) {           int num_gpus = 0;   // number of cuda gpus     // determine number of cuda capable gpus     cudagetdevicecount(&num_gpus);     printf("number of cuda devices:\t%d\n", num_gpus);      typedef thrust::device_vector<int> dvec;     typedef dvec *p_dvec;      // declaring vectors     std::vector<p_dvec> dvecs1;     std::vector<p_dvec> dvecs2;     std::vector<p_dvec> dvecs3;     std::vector<double>p(num_gpus);     dim3 dimgrid((dsize-1)/16.0 +1,1,1);     dim3 dimblock(16.0,1,1);      // initialize vectors     for(unsigned int = 0; < num_gpus; i++) {         cudasetdevice(i);         p_dvec temp1 = new dvec(dsize);         dvecs1.push_back(temp1);         thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0);         p_dvec temp2 = new dvec(dsize);         dvecs2.push_back(temp2);         thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0);     }    // launching kernel   for(unsigned int = 0; < num_gpus; i++) {       cudasetdevice(i);       p_dvec temp = new dvec(dsize);       dvecs3.push_back(temp);       fookernel<<<dimgrid,dimblock>>>(converttokernel(*dvecs1[i])),converttokernel(*(dvecs2[i])),converttokernel(*(dvecs3[i])));       // reduction operation       p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>());       std::cout<<*((*(dvecs3[i])).begin())<<std::endl;       std::cout<<p[i]<<std::endl;   }    printf("success\n");   return 0;   } 

and header file follows:

#include <stdio.h>  #include <cstdio>  #include <stdlib.h> #include <cstdlib> #include <thrust/host_vector.h> #include <thrust/device_vector.h>    #include <thrust/copy.h> #include <thrust/reduce.h> #include <thrust/functional.h>  #define dsize 1048560                                    template < typename t >                                                         struct    kernelarray                                                              {                                                                                 t*  _array;                                                                   int _size;                                                                  };                                                   // function convert device_vector structure                               template < typename t >                                                         kernelarray< t > converttokernel( thrust::device_vector< t >& dvec )            {                                                                                   kernelarray< t > karray;                                                        karray._array = thrust::raw_pointer_cast( &dvec[0] );                           karray._size  = ( int ) dvec.size();                                             return karray;                                                              }                                                                                template< typename scalartype>                                                  __global__ void fookernel( kernelarray< scalartype > array1, kernelarray<scalartype>array2, kernelarray<scalartype> array3) {   size_t = blockidx.x * blockdim.x + threadidx.x;                               if(i< dsize)   array3._array[i] = array2._array[i] +array1._array[i];                         }                      

now if dsize> 1048560, result 0; have few questions:

1)how determine size limit of vector. have 8 devices.

2)is there way increase size of data can use or improve code?

3)when , need cudadevicesynchronize() ?

i happy if can me out.

if had used proper cuda error checking find out if , cuda errors occured, have gotten following output after launching fookernel dsize > 1048560:

invalid argument 

the reason error can have at 65535 blocks in 1 dimension ,

1048560/16 = 65535 

so did not run size limit of vector maximum block limit.


Comments