c++ - NVIDIA Visual profiler does not generate a timeline -


My question is the same as a question [asked here before][1]. No answer has been provided there, so I am asking a separate question.

I am using the CUDA 7.0 toolkit on Windows 7 with Visual Studio 2013.

I tried to generate the timeline of the vector addition sample program, and it worked. When I follow the same steps to generate the timeline of my own code, NVVP keeps showing the message "Running application to generate timeline". I know that the kernel gets called and that it is working.

The cudaDeviceReset() call is there, after everything related to CUDA has finished.

Program: I have changed my original question to provide a minimal working example that reproduces the same problem. The following code does not generate a timeline using NVVP, irrespective of where I put cudaDeviceReset().

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

//opencv
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

#include <stdio.h>

using namespace cv;

// Copies the first three bytes of every pixel from ptr_source to ptr_dst.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles the pixel in
// column x, row y. Threads that fall outside the iw x ih image do nothing,
// so the grid may be larger than the image.
//
//   numchannels - stride, in bytes, between consecutive pixels
//   iw, ih      - image width and height in pixels
//   ptr_source  - device pointer to the input pixels
//   ptr_dst     - device pointer to the output pixels
__global__ void colortransformation_kernel(int numchannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // calculate our pixel's location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // operate only if inside the image boundaries
    if (x >= 0 && x < iw && y >= 0 && y < ih)
    {
        ptr_dst[numchannels * (iw * y + x) + 0] = ptr_source[numchannels * (iw * y + x) + 0];
        ptr_dst[numchannels * (iw * y + x) + 1] = ptr_source[numchannels * (iw * y + x) + 1];
        ptr_dst[numchannels * (iw * y + x) + 2] = ptr_source[numchannels * (iw * y + x) + 2];
    }
}

// Repeatedly uploads a solid-colour 400x400 image, runs the copy kernel on it,
// downloads the result and displays it.
int main()
{
    // NOTE: this loop has no normal exit, so the cudaDeviceReset() below is
    // only reached if a CUDA error breaks out of it.
    while (1)
    {
        Mat image(400, 400, CV_8UC3, Scalar(0, 0, 255));
        unsigned char *h_src = image.data;
        size_t numbytes = image.rows * image.cols * 3;
        int numchannels = 3;

        unsigned char *dev_src, *dev_dst, *h_dst;

        //allocate memory on the device for the source and destination pointers
        cudaMalloc((void**)&dev_src, numbytes * sizeof(unsigned char));
        cudaMalloc((void**)&dev_dst, numbytes * sizeof(unsigned char));

        //copy the source image to the device, i.e. the GPU
        cudaMemcpy(dev_src, h_src, numbytes * sizeof(unsigned char), cudaMemcpyHostToDevice);

        //kernel
        // The grid is three times larger than the image in each dimension;
        // the bounds check inside the kernel makes the extra threads idle.
        dim3 numofblocks(3 * (image.cols / 20), 3 * (image.rows / 20)); //multiplied by 3 because we have a 3-channel image
        dim3 numofthreadsperblocks(20, 20);
        colortransformation_kernel<<<numofblocks, numofthreadsperblocks>>>(numchannels, image.cols, image.rows, dev_src, dev_dst);

        // A launch does not return an error itself: query it, then wait for
        // the kernel so that execution errors surface here as well.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
            cudaFree(dev_src);
            cudaFree(dev_dst);
            break;
        }

        //get the processed image
        Mat org_dijsdk_img(image.rows, image.cols, CV_8UC3);
        h_dst = org_dijsdk_img.data;
        cudaMemcpy(h_dst, dev_dst, numbytes * sizeof(unsigned char), cudaMemcpyDeviceToHost);

        //display the processed image
        imshow("processed dijsdk image", org_dijsdk_img);
        waitKey(33);

        // Release this iteration's device buffers; without this every pass
        // through the loop leaks two allocations.
        cudaFree(dev_src);
        cudaFree(dev_dst);
    }

    cudaDeviceReset();
    return 0;
}

Very important clue: if I comment out the line while(1), and hence run the code only once, then NVVP generates the timeline. But in my original project I cannot get the timeline profile by doing so, because it contains multi-threading and other stuff due to which there is no image to process during the first run. So I need some way to generate the timeline for code containing an infinite while loop.

The problem in my code is the endless while loop, due to which cudaDeviceReset() is never called. There are two possible solutions to deal with such situations:

  1. If you are only interested in having a look at the timeline profile, then comment out the while loop, and NVVP will be able to reach the cudaDeviceReset() present at the end of main().

  2. There might be a situation where you must keep a loop inside the program. For example, in my original project containing multi-threading, there is no image to process during the initial 180 runs of the while loop. To deal with such situations, replace the while loop with a for loop that runs a limited number of times. For example, the following code has helped me get a timeline profile of 4 runs. I am posting only the modified main().

    // Runs the three-stream pipeline a fixed number of times so that the
    // program terminates and cudaDeviceReset() at the end is reached.
    int main()
    {
        // The streams are created once and reused by every iteration.
        cudaStream_t stream_one;
        cudaStream_t stream_two;
        cudaStream_t stream_three;
        cudaStreamCreate(&stream_one);
        cudaStreamCreate(&stream_two);
        cudaStreamCreate(&stream_three);

        //while (1)
        for (int i = 0; i < 4; i++)
        {
            Mat image = imread("dijsdk_test_image.jpg", 1);
            //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
            if (image.empty())
            {
                // Nothing to process if the file could not be read.
                break;
            }
            size_t numbytes = image.rows * image.cols * 3;
            int numchannels = 3;

            // NOTE(review): iw is taken from rows and ih from cols; confirm this
            // matches what callmultistreamingcudakernel expects.
            int iw = image.rows;
            int ih = image.cols;
            size_t totalmemsize = numbytes * sizeof(unsigned char);
            size_t onethirdmemsize = totalmemsize / 3;

            unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;

            //allocate memory on the device for the source and destination pointers
            cudaMalloc((void**)&dev_src_1, onethirdmemsize);
            cudaMalloc((void**)&dev_src_2, onethirdmemsize);
            cudaMalloc((void**)&dev_src_3, onethirdmemsize);
            cudaMalloc((void**)&dev_dst_1, onethirdmemsize);
            cudaMalloc((void**)&dev_dst_2, onethirdmemsize);
            cudaMalloc((void**)&dev_dst_3, onethirdmemsize);

            //host image that receives the processed data
            Mat org_dijsdk_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
            h_dst = org_dijsdk_img.data;
            //host pointer to the new image data
            h_src = image.data;

            //copy the source image to the device, i.e. the GPU, one third per stream
            // NOTE(review): the host buffers come from cv::Mat; if they are not
            // page-locked these copies may not overlap with each other - confirm.
            cudaMemcpyAsync(dev_src_1, h_src, onethirdmemsize, cudaMemcpyHostToDevice, stream_one);
            cudaMemcpyAsync(dev_src_2, h_src + onethirdmemsize, onethirdmemsize, cudaMemcpyHostToDevice, stream_two);
            cudaMemcpyAsync(dev_src_3, h_src + (2 * onethirdmemsize), onethirdmemsize, cudaMemcpyHostToDevice, stream_three);

            // callmultistreamingcudakernel is defined elsewhere (not shown here);
            // its name and arguments are reproduced as they appear in this snippet.
            //kernel--stream-1
            callmultistreamingcudakernel(dev_src_1, dev_dst_1, numchannels, iw, ih, &stream_one);
            //kernel--stream-2
            callmultistreamingcudakernel(dev_src_2, dev_dst_2, numchannels, iw, ih, &stream_two);
            //kernel--stream-3
            callmultistreamingcudakernel(dev_src_3, dev_dst_3, numchannels, iw, ih, &stream_three);

            //result copy: GPU to CPU
            cudaMemcpyAsync(h_dst, dev_dst_1, onethirdmemsize, cudaMemcpyDeviceToHost, stream_one);
            cudaMemcpyAsync(h_dst + onethirdmemsize, dev_dst_2, onethirdmemsize, cudaMemcpyDeviceToHost, stream_two);
            cudaMemcpyAsync(h_dst + (2 * onethirdmemsize), dev_dst_3, onethirdmemsize, cudaMemcpyDeviceToHost, stream_three);

            // wait for the results
            cudaStreamSynchronize(stream_one);
            cudaStreamSynchronize(stream_two);
            cudaStreamSynchronize(stream_three);

            //display the processed image (h_dst already points at its data)
            imshow("processed dijsdk image", org_dijsdk_img);
            waitKey(33);

            // Release this iteration's device buffers so repeated runs do not leak.
            cudaFree(dev_src_1);
            cudaFree(dev_src_2);
            cudaFree(dev_src_3);
            cudaFree(dev_dst_1);
            cudaFree(dev_dst_2);
            cudaFree(dev_dst_3);
        }

        cudaStreamDestroy(stream_one);
        cudaStreamDestroy(stream_two);
        cudaStreamDestroy(stream_three);

        cudaDeviceReset();
        return 0;
    }

Comments