2015年10月20日 星期二

CUDA: Trying to Measure the Kernel with a Timer

since: 2015/10/20
update: 2015/10/20

reference:
1. Professional CUDA C Programming
2. I touchs: Get Started On NVIDIA Jetson TK1
3. I touchs: How to re-flash your Jetson TK1 Development Kit 
4. I touchs: Using the Jetson TK1 as a remote development environment for CUDA


A. Timing With CPU Timer

     1. 在 Nsight Eclipse Edition 撰寫程式如下:
         // 檔名: sumArraysOnGPU-timer.cu
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>


// check error
//
// Implemented as a macro (not a function) so that __FILE__ and __LINE__
// expand at the CALL site.  In the original function version they always
// reported this definition's location, making the diagnostic useless for
// finding which CUDA call failed.  On failure we exit immediately: CUDA
// errors are sticky, so continuing would make every later call fail with
// a confusing secondary message.
#define CHECK(call)                                                     \
    do {                                                                \
        const cudaError_t error = (call);                               \
        if (error != cudaSuccess)                                       \
        {                                                               \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);      \
            fprintf(stderr, "code: %d, reason: %s\n", error,            \
                    cudaGetErrorString(error));                         \
            exit(1);                                                    \
        }                                                               \
    } while (0)

 
// the timer on cpu
//
// Returns the current wall-clock time in seconds (microsecond
// resolution) as a double, intended for interval measurement by
// subtracting two samples.
double seconds()
{
    struct timeval tp;
    // The timezone argument of gettimeofday() is obsolete (POSIX);
    // pass NULL instead of a dummy struct.  The original also stored
    // the return value in an unused variable.
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}


// verifying your kernel
//
// Compares the CPU reference result against the GPU result element by
// element.  Prints a diagnostic for the first mismatch found, or a
// success message when all N elements agree within epsilon.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        // BUG FIX: the original called abs(), the *integer* overload,
        // which truncates the float difference toward zero — any
        // difference smaller than 1.0 was silently reported as a match.
        // fabs() keeps the comparison in floating point.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");

    return;
}

 

// initial random data
//
// Fills ip[0..size-1] with pseudo-random floats in [0.0, 25.5]
// (rand() & 0xFF gives 0..255, divided by 10).
void initialData(float *ip, int size)
{
    // Seed the generator only ONCE per process.  The original reseeded
    // with time() on every call, so two calls within the same second
    // (the common case — see main, which calls this back to back for
    // h_A and h_B) produced identical arrays.
    static bool seeded = false;
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }

    return;
}



// sum Arrays On Host
//
// Element-wise vector addition on the CPU: C[i] = A[i] + B[i] for the
// first N elements.  Serves as the reference implementation used to
// validate the GPU kernel's output.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    const float *pa = A;
    const float *pb = B;
    float *pc = C;

    for (int remaining = N; remaining > 0; remaining--)
    {
        *pc++ = *pa++ + *pb++;
    }
}


// sum Arrays On GPU
//
// Element-wise vector addition kernel.  Expects a 1D grid of 1D blocks
// with one thread per element; the grid is rounded up, so threads whose
// global index falls past the end of the array return without touching
// memory.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= N) return;

    C[idx] = A[idx] + B[idx];
}


// the main function
//
// Host driver: allocates and initializes two input vectors, computes
// the reference sum on the CPU, runs the vector-add kernel on the GPU,
// and verifies the two results agree.  Both the CPU and GPU phases are
// timed with the wall-clock CPU timer (seconds()).
int main(int argc, char **argv)
{
    printf("\n%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors: 2^24 = 16,777,216 elements (64 MB each)
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // 256 MB of host allocations in total — fail cleanly instead of
    // crashing inside initialData() on a NULL pointer.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }

    double iStart, iElaps;

    // initialize data at host side
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side: 256 threads per block, grid rounded up
    int iLen = 256;
    dim3 block (iLen);
    dim3 grid  ((nElem + block.x - 1) / block.x);

    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // Check for launch-configuration errors IMMEDIATELY after the launch
    // (the original deferred this check until much later), then block
    // until the kernel finishes so the CPU timer measures its full
    // execution rather than just the asynchronous launch.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
           block.x, iElaps);


    // the testing result ....
    //

    // sumArraysOnGPU <<<  16384, 1024  >>>  Time elapsed 0.040817 sec
    // sumArraysOnGPU <<<  32768, 512  >>>  Time elapsed 0.031031 sec
    // sumArraysOnGPU <<<  65536, 256  >>>  Time elapsed 0.029070 sec
    // sumArraysOnGPU <<<  131072, 128  >>>  Time elapsed 0.028804 sec

    // sumArraysOnGPU <<<  262144, 64  >>>  Time elapsed 0.056073 sec
    // sumArraysOnGPU <<<  524288, 32  >>>  Time elapsed 0.098515 sec
    // sumArraysOnGPU <<<  1048576, 16  >>>  Time elapsed 0.190098 sec
    // sumArraysOnGPU <<<  2097152, 8  >>>  Time elapsed 0.375030 sec
    // sumArraysOnGPU <<<  4194304, 4  >>>  Time elapsed 0.803681 sec
    // sumArraysOnGPU <<<  8388608, 2  >>>  Time elapsed 1.835205 sec
    // sumArraysOnGPU <<<  16777216, 1  >>>  Time elapsed 3.515245 sec


    // copy kernel result back to host side (blocking, so no extra sync
    // is needed before reading gpuRef)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset the device so profiling tools (e.g. nvprof) can flush
    // their buffered data before the process exits
    CHECK(cudaDeviceReset());

    return(0);
}


     2. 執行結果:

-----------------------------------------------------------------------------------------------

B. Timing With nvprof (the NVIDIA profiling tool)
     1. Before this, Don't forget to Add CUDA bin Environment Variables
         => How to re-flash your Jetson TK1 Development Kit
         => G. Add CUDA bin Environment Variables


     2. Login to TK1:
         $ ssh ubuntu@192.168.0.106

     3. Go to the release directory:
        $ cd /home/ubuntu/RD/projects/cuda/Release

     4. Timing with nvprof:
         $ nvprof ./NSightCUDA

     5. Result:
./NSightCUDA Starting...
==2996== NVPROF is profiling process 2996, command: ./NSightCUDA
Using Device 0: GK20A
Vector size 16777216
initialData Time elapsed 4.632509 sec
sumArraysOnHost Time elapsed 0.060414 sec
sumArraysOnGPU <<<  65536, 256  >>>  Time elapsed 0.023884 sec
Arrays match.

==2996== Profiling application: ./NSightCUDA
==2996== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 57.86%  137.14ms         3  45.714ms  44.908ms  46.267ms  [CUDA memcpy HtoD]
 32.42%  76.837ms         1  76.837ms  76.837ms  76.837ms  [CUDA memcpy DtoH]
  9.73%  23.053ms         1  23.053ms  23.053ms  23.053ms  sumArraysOnGPU(float*, float*, float*, int)

==2996== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 54.19%  219.10ms         4  54.774ms  45.602ms  79.567ms  cudaMemcpy
 39.03%  157.79ms         3  52.598ms  1.0795ms  152.74ms  cudaMalloc
  5.87%  23.717ms         1  23.717ms  23.717ms  23.717ms  cudaDeviceSynchronize
  0.64%  2.5713ms         3  857.08us  661.42us  1.0873ms  cudaFree
  0.14%  582.58us        83  7.0190us  1.4160us  302.67us  cuDeviceGetAttribute
  0.08%  331.83us         1  331.83us  331.83us  331.83us  cudaGetDeviceProperties
  0.03%  119.25us         1  119.25us  119.25us  119.25us  cudaLaunch
  0.01%  45.583us         1  45.583us  45.583us  45.583us  cudaSetDevice
  0.00%  17.416us         2  8.7080us  4.5000us  12.916us  cuDeviceGetCount
  0.00%  17.333us         1  17.333us  17.333us  17.333us  cudaGetLastError
  0.00%  10.667us         1  10.667us  10.667us  10.667us  cuDeviceTotalMem
  0.00%  9.0000us         1  9.0000us  9.0000us  9.0000us  cudaConfigureCall
  0.00%  6.2500us         1  6.2500us  6.2500us  6.2500us  cuDeviceGetName
  0.00%  4.8340us         4  1.2080us     667ns  1.8340us  cudaSetupArgument
  0.00%  4.1670us         2  2.0830us  1.8330us  2.3340us  cuDeviceGet


  備註: The nvprof result is more accurate than the host-side timing result

沒有留言:

張貼留言

注意:只有此網誌的成員可以留言。