c++ - Why not cudaMemcpyAsync(host to device) and CUDA kernel are parallel? -

i have loaded image (8 bit, unsigned char) of size 1080 x 1920. testing purposes, processing same image 4 times using for loop , then, generating timeline profiling.

strategy: dividing image 3 parts. have made 3 streams processing of whole image.

i providing minimal working example below. sorry need image using opencv don't know how can mimic same situation without loading image using opencv.

problem: timeline profiling shows first stream has finished transferring data still kernel assigned did not start. kernel assigned first stream , data transfer third stream parallel. so, question why processing of first stream's kernel did not start in parallel data transfer of second stream?

gpu: nvidia quadro k2000, compatible 3.0

timeline profile: each stream has been assigned different color.

my code:

__global__ void multistream_colortransformation_kernel(int numchannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst) {     // calculate our pixel's location     int x = (blockidx.x * blockdim.x) + threadidx.x;     int y = (blockidx.y * blockdim.y) + threadidx.y;      // operate if in correct boundaries     if (x >= 0 && x < iw && y >= 0 && y < ih / 3)     {         ptr_dst[numchannels*  (iw*y + x) + 0] = ptr_source[numchannels*  (iw*y + x) + 0];         ptr_dst[numchannels*  (iw*y + x) + 1] = ptr_source[numchannels*  (iw*y + x) + 1];         ptr_dst[numchannels*  (iw*y + x) + 2] = ptr_source[numchannels*  (iw*y + x) + 2];      } }  void callmultistreamingcudakernel(unsigned char *dev_src, unsigned char *dev_dst, int numchannels, int iw, int ih, cudastream_t *ptr_stream) {      dim3 numofblocks((iw / 20), (ih / 20)); //don't multiply 3 because have 1/3 data of image     dim3 numofthreadsperblocks(20, 20);     multistream_colortransformation_kernel << <numofblocks, numofthreadsperblocks, 0, *ptr_stream >> >(numchannels, iw, ih, dev_src, dev_dst);      return; }  int main() {      cudastream_t stream_one;     cudastream_t stream_two;     cudastream_t stream_three;      cudastreamcreate(&stream_one);     cudastreamcreate(&stream_two);     cudastreamcreate(&stream_three);      mat image = imread("dijsdk_test_image.jpg", 1);     //mat image(1080, 1920, cv_8uc3, scalar(0,0,255));     size_t numbytes = image.rows * image.cols * 3;     int numchannels = 3;      int iw = image.rows;     int ih = image.cols;     size_t totalmemsize = numbytes * sizeof(unsigned char);     size_t onethirdmemsize = totalmemsize / 3;      unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;       //allocate memomry @ device source , destination , pointers     cudamalloc((void**)&dev_src_1, (totalmemsize) / 3);     cudamalloc((void**)&dev_src_2, (totalmemsize) / 3);     cudamalloc((void**)&dev_src_3, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_1, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_2, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_3, (totalmemsize) / 3);      //get processed image      mat org_dijsdk_img(image.rows, image.cols, cv_8uc3, scalar(0, 0, 255));     h_dst = org_dijsdk_img.data;      //while (1)     (int = 0; < 3; i++)     {         std::cout << "\nloop: " << i;          //copy new data of image host pointer         h_src = image.data;          //copy source image device i.e. gpu         cudamemcpyasync(dev_src_1, h_src, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_one);         //kernel--stream-1         callmultistreamingcudakernel(dev_src_1, dev_dst_1, numchannels, iw, ih, &stream_one);           //copy source image device i.e. gpu         cudamemcpyasync(dev_src_2, h_src + onethirdmemsize, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_two);         //kernel--stream-2         callmultistreamingcudakernel(dev_src_2, dev_dst_2, numchannels, iw, ih, &stream_two);          //copy source image device i.e. gpu         cudamemcpyasync(dev_src_3, h_src + (2 * onethirdmemsize), (totalmemsize) / 3, cudamemcpyhosttodevice, stream_three);         //kernel--stream-3         callmultistreamingcudakernel(dev_src_3, dev_dst_3, numchannels, iw, ih, &stream_three);           //result copy: gpu cpu         cudamemcpyasync(h_dst, dev_dst_1, (totalmemsize) / 3, cudamemcpydevicetohost, stream_one);         cudamemcpyasync(h_dst + onethirdmemsize, dev_dst_2, (totalmemsize) / 3, cudamemcpydevicetohost, stream_two);         cudamemcpyasync(h_dst + (2 * onethirdmemsize), dev_dst_3, (totalmemsize) / 3, cudamemcpydevicetohost, stream_three);          // wait results          cudastreamsynchronize(stream_one);         cudastreamsynchronize(stream_two);         cudastreamsynchronize(stream_three);          //assign processed data display image.         org_dijsdk_img.data = h_dst;         //display processed image                    imshow("processed dijsdk image", org_dijsdk_img);         waitkey(33);     }      cudadevicereset();     return 0; }

update-1: if remove kernel call of first stream then, second kernel , h2d copy of third stream somehow overlapped (not completely) shown below.

update-2 tried use 10 streams , things remain same. first stream's kernel processing began after h2d copy of tenth's stream data.

as commenters pointed out, host memory must page locked.

there no need allocate additional host memory through cudahostalloc, can use cudahostregister on existing opencv image:

cudahostregister(image.data, totalmemsize, cudahostregisterportable)

WIKI

Search This Blog

c++ - Why not cudaMemcpyAsync(host to device) and CUDA kernel are parallel? -

Comments

Post a Comment