gpgpu - determing the limit of size of the array when writing CUDA kernel for multi-gpu using Thrust library -
i trying write cuda kernel use multi-gpu , thrust library features. used tips previous posts.i tried write simple addition kernel. obvious intention use more complicated kernels.
my code follows:
#include "test.h" int main(int argc, char *argv[]) { int num_gpus = 0; // number of cuda gpus // determine number of cuda capable gpus cudagetdevicecount(&num_gpus); printf("number of cuda devices:\t%d\n", num_gpus); typedef thrust::device_vector<int> dvec; typedef dvec *p_dvec; // declaring vectors std::vector<p_dvec> dvecs1; std::vector<p_dvec> dvecs2; std::vector<p_dvec> dvecs3; std::vector<double>p(num_gpus); dim3 dimgrid((dsize-1)/16.0 +1,1,1); dim3 dimblock(16.0,1,1); // initialize vectors for(unsigned int = 0; < num_gpus; i++) { cudasetdevice(i); p_dvec temp1 = new dvec(dsize); dvecs1.push_back(temp1); thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0); p_dvec temp2 = new dvec(dsize); dvecs2.push_back(temp2); thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0); } // launching kernel for(unsigned int = 0; < num_gpus; i++) { cudasetdevice(i); p_dvec temp = new dvec(dsize); dvecs3.push_back(temp); fookernel<<<dimgrid,dimblock>>>(converttokernel(*dvecs1[i])),converttokernel(*(dvecs2[i])),converttokernel(*(dvecs3[i]))); // reduction operation p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>()); std::cout<<*((*(dvecs3[i])).begin())<<std::endl; std::cout<<p[i]<<std::endl; } printf("success\n"); return 0; } and header file follows:
#include <stdio.h> #include <cstdio> #include <stdlib.h> #include <cstdlib> #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/copy.h> #include <thrust/reduce.h> #include <thrust/functional.h> #define dsize 1048560 template < typename t > struct kernelarray { t* _array; int _size; }; // function convert device_vector structure template < typename t > kernelarray< t > converttokernel( thrust::device_vector< t >& dvec ) { kernelarray< t > karray; karray._array = thrust::raw_pointer_cast( &dvec[0] ); karray._size = ( int ) dvec.size(); return karray; } template< typename scalartype> __global__ void fookernel( kernelarray< scalartype > array1, kernelarray<scalartype>array2, kernelarray<scalartype> array3) { size_t = blockidx.x * blockdim.x + threadidx.x; if(i< dsize) array3._array[i] = array2._array[i] +array1._array[i]; } now if dsize> 1048560, result 0; have few questions:
1)how determine size limit of vector. have 8 devices.
2)is there way increase size of data can use or improve code?
3)when , need cudadevicesynchronize() ?
i happy if can me out.
if had used proper cuda error checking find out if , cuda errors occured, have gotten following output after launching fookernel dsize > 1048560:
invalid argument the reason error can have at 65535 blocks in 1 dimension ,
1048560/16 = 65535 so did not run size limit of vector maximum block limit.
Comments
Post a Comment