c++ - Why not cudaMemcpyAsync(host to device) and CUDA kernel are parallel? -


i have loaded image (8 bit, unsigned char) of size 1080 x 1920. testing purposes, processing same image 4 times using for loop , then, generating timeline profiling.

strategy: dividing image 3 parts. have made 3 streams processing of whole image.

i providing minimal working example below. sorry need image using opencv don't know how can mimic same situation without loading image using opencv.

problem: timeline profiling shows first stream has finished transferring data still kernel assigned did not start. kernel assigned first stream , data transfer third stream parallel. so, question why processing of first stream's kernel did not start in parallel data transfer of second stream?

gpu: nvidia quadro k2000, compatible 3.0

timeline profile: each stream has been assigned different color.

image

my code:

__global__ void multistream_colortransformation_kernel(int numchannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst) {     // calculate our pixel's location     int x = (blockidx.x * blockdim.x) + threadidx.x;     int y = (blockidx.y * blockdim.y) + threadidx.y;      // operate if in correct boundaries     if (x >= 0 && x < iw && y >= 0 && y < ih / 3)     {         ptr_dst[numchannels*  (iw*y + x) + 0] = ptr_source[numchannels*  (iw*y + x) + 0];         ptr_dst[numchannels*  (iw*y + x) + 1] = ptr_source[numchannels*  (iw*y + x) + 1];         ptr_dst[numchannels*  (iw*y + x) + 2] = ptr_source[numchannels*  (iw*y + x) + 2];      } }  void callmultistreamingcudakernel(unsigned char *dev_src, unsigned char *dev_dst, int numchannels, int iw, int ih, cudastream_t *ptr_stream) {      dim3 numofblocks((iw / 20), (ih / 20)); //don't multiply 3 because have 1/3 data of image     dim3 numofthreadsperblocks(20, 20);     multistream_colortransformation_kernel << <numofblocks, numofthreadsperblocks, 0, *ptr_stream >> >(numchannels, iw, ih, dev_src, dev_dst);      return; }  int main() {      cudastream_t stream_one;     cudastream_t stream_two;     cudastream_t stream_three;      cudastreamcreate(&stream_one);     cudastreamcreate(&stream_two);     cudastreamcreate(&stream_three);      mat image = imread("dijsdk_test_image.jpg", 1);     //mat image(1080, 1920, cv_8uc3, scalar(0,0,255));     size_t numbytes = image.rows * image.cols * 3;     int numchannels = 3;      int iw = image.rows;     int ih = image.cols;     size_t totalmemsize = numbytes * sizeof(unsigned char);     size_t onethirdmemsize = totalmemsize / 3;      unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;       //allocate memomry @ device source , destination , pointers     cudamalloc((void**)&dev_src_1, (totalmemsize) / 3);     cudamalloc((void**)&dev_src_2, (totalmemsize) / 3);     cudamalloc((void**)&dev_src_3, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_1, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_2, (totalmemsize) / 3);     cudamalloc((void**)&dev_dst_3, (totalmemsize) / 3);      //get processed image      mat org_dijsdk_img(image.rows, image.cols, cv_8uc3, scalar(0, 0, 255));     h_dst = org_dijsdk_img.data;      //while (1)     (int = 0; < 3; i++)     {         std::cout << "\nloop: " << i;          //copy new data of image host pointer         h_src = image.data;          //copy source image device i.e. gpu         cudamemcpyasync(dev_src_1, h_src, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_one);         //kernel--stream-1         callmultistreamingcudakernel(dev_src_1, dev_dst_1, numchannels, iw, ih, &stream_one);           //copy source image device i.e. gpu         cudamemcpyasync(dev_src_2, h_src + onethirdmemsize, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_two);         //kernel--stream-2         callmultistreamingcudakernel(dev_src_2, dev_dst_2, numchannels, iw, ih, &stream_two);          //copy source image device i.e. gpu         cudamemcpyasync(dev_src_3, h_src + (2 * onethirdmemsize), (totalmemsize) / 3, cudamemcpyhosttodevice, stream_three);         //kernel--stream-3         callmultistreamingcudakernel(dev_src_3, dev_dst_3, numchannels, iw, ih, &stream_three);           //result copy: gpu cpu         cudamemcpyasync(h_dst, dev_dst_1, (totalmemsize) / 3, cudamemcpydevicetohost, stream_one);         cudamemcpyasync(h_dst + onethirdmemsize, dev_dst_2, (totalmemsize) / 3, cudamemcpydevicetohost, stream_two);         cudamemcpyasync(h_dst + (2 * onethirdmemsize), dev_dst_3, (totalmemsize) / 3, cudamemcpydevicetohost, stream_three);          // wait results          cudastreamsynchronize(stream_one);         cudastreamsynchronize(stream_two);         cudastreamsynchronize(stream_three);          //assign processed data display image.         org_dijsdk_img.data = h_dst;         //display processed image                    imshow("processed dijsdk image", org_dijsdk_img);         waitkey(33);     }      cudadevicereset();     return 0; } 

update-1: if remove kernel call of first stream then, second kernel , h2d copy of third stream somehow overlapped (not completely) shown below.

image2

update-2 tried use 10 streams , things remain same. first stream's kernel processing began after h2d copy of tenth's stream data.

image-3

as commenters pointed out, host memory must page locked.

there no need allocate additional host memory through cudahostalloc, can use cudahostregister on existing opencv image:

cudahostregister(image.data, totalmemsize, cudahostregisterportable) 

Comments