My question is the same as one [asked here before][1]. No answer has been provided there, so I am asking a separate question.
I am using the CUDA 7.0 toolkit on Windows 7, with Visual Studio 2013.
I tried to generate a timeline for the vector addition sample program, and it worked. But when I follow the same steps to generate a timeline for my own code, nvvp keeps showing the message "Running application to generate timeline". I know the kernel gets called and everything is working.
The cudaDeviceReset() call is also there, after everything related to CUDA has finished.
Program: I have changed my original question to provide a minimal working example that reproduces the same problem. The following code does not generate a timeline in nvvp, irrespective of where I put cudaDeviceReset().
#include "cuda_runtime.h" #include "device_launch_parameters.h" //opencv #include <opencv2/highgui.hpp> #include <opencv2/core.hpp> #include <opencv2/imgproc.hpp> #include <stdio.h> using namespace cv; __global__ void colortransformation_kernel(int numchannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst) { // calculate our pixel's location int x = (blockidx.x * blockdim.x) + threadidx.x; int y = (blockidx.y * blockdim.y) + threadidx.y; // operate if in correct boundaries if (x >= 0 && x < iw && y >= 0 && y < ih) { ptr_dst[numchannels* (iw*y + x) + 0] = ptr_source[numchannels* (iw*y + x) + 0]; ptr_dst[numchannels* (iw*y + x) + 1] = ptr_source[numchannels* (iw*y + x) + 1]; ptr_dst[numchannels* (iw*y + x) + 2] = ptr_source[numchannels* (iw*y + x) + 2]; } } int main() { while (1) { mat image(400, 400, cv_8uc3, scalar(0, 0, 255)); unsigned char *h_src = image.data; size_t numbytes = image.rows * image.cols * 3; int numchannels = 3; unsigned char *dev_src, *dev_dst, *h_dst; //allocate memomry @ device source , destination , pointers cudamalloc((void**)&dev_src, numbytes * sizeof(unsigned char)); cudamalloc((void**)&dev_dst, numbytes * sizeof(unsigned char)); ////copy source image device i.e. 
gpu cudamemcpy(dev_src, h_src, numbytes * sizeof(unsigned char), cudamemcpyhosttodevice); ////kernel dim3 numofblocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied 3 because have 3 channel image dim3 numofthreadsperblocks(20, 20); colortransformation_kernel << <numofblocks, numofthreadsperblocks >> >(numchannels, image.cols, image.rows, dev_src, dev_dst); cudadevicesynchronize(); //get processed image mat org_dijsdk_img(image.rows, image.cols, cv_8uc3); h_dst = org_dijsdk_img.data; cudamemcpy(h_dst, dev_dst, numbytes * sizeof(unsigned char), cudamemcpydevicetohost); //display processed image imshow("processed dijsdk image", org_dijsdk_img); waitkey(33); } cudadevicereset(); return 0; } Very important clue: if I comment out the line while(1), and hence run the code only once, then nvvp generates the timeline. But in my original project I cannot get a timeline profile this way, because it contains multi-threading and other stuff, due to which there is no image to process during the first run. So I need some way to generate a timeline for code that contains an infinite while loop.
The problem in my code is the endless while loop, due to which cudaDeviceReset() is never called. There are two possible solutions for dealing with such situations:
1. If you are interested in just having a look at the timeline profile, comment out the while loop, so that nvvp is able to reach the cudaDeviceReset() present at the end of main().
2. There might be a situation where you must keep a loop inside the program. For example, in my original project containing multi-threading, there is no image to process during the initial 180 runs of the while loop. To deal with such situations, replace the while loop with a for loop that runs a limited number of times. For example, the following code helped me get timeline profiling for 4 runs. I am posting only the modified main().
int main() { cudastream_t stream_one; cudastream_t stream_two; cudastream_t stream_three; //while (1) (int = 0; < 4; i++) { cudastreamcreate(&stream_one); cudastreamcreate(&stream_two); cudastreamcreate(&stream_three); mat image = imread("dijsdk_test_image.jpg", 1); //mat image(1080, 1920, cv_8uc3, scalar(0,0,255)); size_t numbytes = image.rows * image.cols * 3; int numchannels = 3; int iw = image.rows; int ih = image.cols; size_t totalmemsize = numbytes * sizeof(unsigned char); size_t onethirdmemsize = totalmemsize / 3; unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst; //allocate memomry @ device source , destination , pointers cudamalloc((void**)&dev_src_1, (totalmemsize) / 3); cudamalloc((void**)&dev_src_2, (totalmemsize) / 3); cudamalloc((void**)&dev_src_3, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_1, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_2, (totalmemsize) / 3); cudamalloc((void**)&dev_dst_3, (totalmemsize) / 3); //get processed image mat org_dijsdk_img(image.rows, image.cols, cv_8uc3, scalar(0, 0, 255)); h_dst = org_dijsdk_img.data; //copy new data of image host pointer h_src = image.data; //copy source image device i.e. 
gpu cudamemcpyasync(dev_src_1, h_src, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_one); cudamemcpyasync(dev_src_2, h_src + onethirdmemsize, (totalmemsize) / 3, cudamemcpyhosttodevice, stream_two); cudamemcpyasync(dev_src_3, h_src + (2 * onethirdmemsize), (totalmemsize) / 3, cudamemcpyhosttodevice, stream_three); //kernel--stream-1 callmultistreamingcudakernel(dev_src_1, dev_dst_1, numchannels, iw, ih, &stream_one); //kernel--stream-2 callmultistreamingcudakernel(dev_src_2, dev_dst_2, numchannels, iw, ih, &stream_two); //kernel--stream-3 callmultistreamingcudakernel(dev_src_3, dev_dst_3, numchannels, iw, ih, &stream_three); //result copy: gpu cpu cudamemcpyasync(h_dst, dev_dst_1, (totalmemsize) / 3, cudamemcpydevicetohost, stream_one); cudamemcpyasync(h_dst + onethirdmemsize, dev_dst_2, (totalmemsize) / 3, cudamemcpydevicetohost, stream_two); cudamemcpyasync(h_dst + (2 * onethirdmemsize), dev_dst_3, (totalmemsize) / 3, cudamemcpydevicetohost, stream_three); // wait results cudastreamsynchronize(stream_one); cudastreamsynchronize(stream_two); cudastreamsynchronize(stream_three); //assign processed data display image. org_dijsdk_img.data = h_dst; //display processed image imshow("processed dijsdk image", org_dijsdk_img); waitkey(33); } cudadevicereset(); return 0; }
Comments
Post a Comment