i have loaded image (8 bit, unsigned char) of size 1080 x 1920. testing purposes, processing same image 4 times using for loop , then, generating timeline profiling.
strategy: dividing image 3 parts. have made 3 streams processing of whole image.
i providing minimal working example below. sorry need image using opencv don't know how can mimic same situation without loading image using opencv.
problem: timeline profiling shows first stream has finished transferring data still kernel assigned did not start. kernel assigned first stream , data transfer third stream parallel. so, question why processing of first stream's kernel did not start in parallel data transfer of second stream?
gpu: nvidia quadro k2000, compatible 3.0
timeline profile: each stream has been assigned different color.

my code:
__global__ void multistream_colortransformation_kernel(int numchannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst) { // calculate our pixel's location int x = (blockidx.x * blockdim.x) + threadidx.x; int y = (blockidx.y * blockdim.y) + threadidx.y; // operate if in correct boundaries if (x >= 0 && x < iw && y >= 0 && y < ih / 3) { ptr_dst[numchannels* (iw*y + x) + 0] = ptr_source[numchannels* (iw*y + x) + 0]; ptr_dst[numchannels* (iw*y + x) + 1] = ptr_source[numchannels* (iw*y + x) + 1]; ptr_dst[numchannels* (iw*y + x) + 2] = ptr_source[numchannels* (iw*y + x) + 2]; } } void callmultistreamingcudakernel(unsigned char *dev_src, unsigned char *dev_dst, int numchannels, int iw, int ih, cudastream_t *ptr_stream) { dim3 numofblocks((iw / 20), (ih / 20)); //don't multiply 3 because have 1/3 data of image dim3 numofthreadsperblocks(20, 20); multistream_colortransformation_kernel << <numofblocks, numofthreadsperblocks, 0, *ptr_stream >> >(numchannels, iw, ih, dev_src, dev_dst); return; } int main() { cudastream_t stream_one; cudastream_t stream_two; cudastream_t stream_three; cudastreamcreate(&stream_one); cudastreamcreate(&stream_two); cudastreamcreate(&stream_three); mat image = imread("dijsdk_test_image.jpg", 1); //mat image(1080, 1920, cv_8uc3, scalar(0,0,255)); size_t numbytes = image.rows * image.cols * 3; int numchannels = 3; int iw = image.rows; int ih = image.cols; size_t totalmemsize = numbytes * sizeof(unsigned char); size_t onethirdmemsize = totalmemsize / 3; unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst; //allocate memomry @ device source , destination , pointers cudamalloc((void**)&dev_src_1, (totalmemsize) / 3); cudamalloc((void**)&dev_src_2, (totalmemsize) / 3); cudamalloc((void**)&dev_src_3, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_1, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_2, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_3, (totalmemsize) / 3); //get processed image mat org_dijsdk_img(image.rows, image.cols, cv_8uc3, scalar(0, 0, 255)); h_dst = org_dijsdk_img.data; //while (1) (int = 0; < 3; i++) { std::cout << "\nloop: " << i; //copy new data of image host pointer h_src = image.data; //copy source image device i.e. gpu cudamemcpyasync(dev_src_1, h_src, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_one); //kernel--stream-1 callmultistreamingcudakernel(dev_src_1, dev_dst_1, numchannels, iw, ih, &stream_one); //copy source image device i.e. gpu cudamemcpyasync(dev_src_2, h_src + onethirdmemsize, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_two); //kernel--stream-2 callmultistreamingcudakernel(dev_src_2, dev_dst_2, numchannels, iw, ih, &stream_two); //copy source image device i.e. gpu cudamemcpyasync(dev_src_3, h_src + (2 * onethirdmemsize), (totalmemsize) / 3, cudamemcpyhosttodevice, stream_three); //kernel--stream-3 callmultistreamingcudakernel(dev_src_3, dev_dst_3, numchannels, iw, ih, &stream_three); //result copy: gpu cpu cudamemcpyasync(h_dst, dev_dst_1, (totalmemsize) / 3, cudamemcpydevicetohost, stream_one); cudamemcpyasync(h_dst + onethirdmemsize, dev_dst_2, (totalmemsize) / 3, cudamemcpydevicetohost, stream_two); cudamemcpyasync(h_dst + (2 * onethirdmemsize), dev_dst_3, (totalmemsize) / 3, cudamemcpydevicetohost, stream_three); // wait results cudastreamsynchronize(stream_one); cudastreamsynchronize(stream_two); cudastreamsynchronize(stream_three); //assign processed data display image. org_dijsdk_img.data = h_dst; //display processed image imshow("processed dijsdk image", org_dijsdk_img); waitkey(33); } cudadevicereset(); return 0; } update-1: if remove kernel call of first stream then, second kernel , h2d copy of third stream somehow overlapped (not completely) shown below.

update-2 tried use 10 streams , things remain same. first stream's kernel processing began after h2d copy of tenth's stream data.

as commenters pointed out, host memory must page locked.
there no need allocate additional host memory through cudahostalloc, can use cudahostregister on existing opencv image:
cudahostregister(image.data, totalmemsize, cudahostregisterportable)
Comments
Post a Comment