update: 2015/10/20
reference:
1. Professional CUDA C Programming
2. I touchs: Get Started On NVIDIA Jetson TK1
3. I touchs: How to re-flash your Jetson TK1 Development Kit
4. I touchs: Using the Jetson TK1 as a remote development environment for CUDA
A. Timing With CPU Timer
1. 在 Nsight Eclipse Edition 撰寫程式如下:
// 檔名: sumArraysOnGPU-timer.cu
#include <cuda_runtime.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
// check error
// Check the result of a CUDA runtime call and terminate on failure.
// Implemented as a macro (not a function) so that __FILE__/__LINE__ expand at
// the call site — as a function they always reported this helper's own line,
// making every error message point to the same place. Exiting on error stops
// the program from continuing with invalid device state; once a CUDA error is
// "sticky", every subsequent API call fails mysteriously.
#define CHECK(call)                                                        \
    do {                                                                   \
        const cudaError_t error = (call);                                  \
        if (error != cudaSuccess)                                          \
        {                                                                  \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);         \
            fprintf(stderr, "code: %d, reason: %s\n", error,               \
                    cudaGetErrorString(error));                            \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
// the timer on cpu
// Wall-clock timestamp in seconds (microsecond resolution), used as a simple
// CPU-side timer: call before and after a phase and subtract.
double seconds()
{
    struct timeval tp;
    // Pass NULL for the timezone argument: struct timezone is obsolete and
    // POSIX specifies a null pointer here. The original also stored the
    // (unchecked) return value in an unused variable, now removed.
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
// verifying your kernel
// Compare the CPU reference result against the GPU result element-wise and
// report the first mismatch (or "Arrays match." if all N elements agree).
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: abs() is the *integer* overload, which truncates the
        // float difference toward zero — any mismatch smaller than 1.0 became
        // 0 and was silently accepted, defeating the 1e-8 epsilon entirely.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}
// initial random data
// Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5]
// ((0..255) / 10.0f).
void initialData(float *ip, int size)
{
    // Seed the generator only once per process. The original re-seeded with
    // time() on every call, so the two back-to-back calls in main() (within
    // the same second) produced byte-identical "random" arrays.
    static bool seeded = false;
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
    return;
}
// sum Arrays On Host
// CPU reference implementation of element-wise vector addition:
// C[i] = A[i] + B[i] for every i in [0, N). A no-op when N <= 0.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    float *end = C + N;
    while (C < end)
    {
        *C++ = *A++ + *B++;
    }
}
// sum Arrays On GPU
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i];
}
// the main function
// Driver: allocates host and device buffers, runs the vector addition on both
// CPU and GPU, times each phase with the CPU wall-clock timer seconds(), and
// verifies the GPU result against the CPU reference.
int main(int argc, char **argv)
{
    printf("\n%s Starting...\n", argv[0]);

    // select device 0 and report its name
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors: 1 << 24 = 16,777,216 elements
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // allocate host memory: two inputs plus one result buffer per side
    size_t nBytes = nElem * sizeof(float);
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes); // CPU reference result
    gpuRef = (float *)malloc(nBytes);  // result copied back from the GPU
    double iStart, iElaps;

    // initialize input data at host side (timed)
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);
    // zero both result buffers so stale heap contents can't fake a match
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add vectors at host side for result checks (timed)
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // allocate device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer input data from host to device (d_C is zero-filled via gpuRef)
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side: 1D grid sized by ceiling division so the
    // grid covers all nElem elements even when nElem % iLen != 0
    int iLen = 256;
    dim3 block (iLen);
    dim3 grid ((nElem + block.x - 1) / block.x);
    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // Kernel launches are asynchronous; without this synchronization the CPU
    // timer below would measure only the launch overhead, not the kernel.
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumArraysOnGPU <<< %d, %d >>> Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    // measured results on the Jetson TK1 (GK20A) for various block sizes;
    // throughput degrades sharply once blocks shrink below a few warps:
    //
    // sumArraysOnGPU <<< 16384, 1024 >>> Time elapsed 0.040817 sec
    // sumArraysOnGPU <<< 32768, 512 >>> Time elapsed 0.031031 sec
    // sumArraysOnGPU <<< 65536, 256 >>> Time elapsed 0.029070 sec
    // sumArraysOnGPU <<< 131072, 128 >>> Time elapsed 0.028804 sec
    // sumArraysOnGPU <<< 262144, 64 >>> Time elapsed 0.056073 sec
    // sumArraysOnGPU <<< 524288, 32 >>> Time elapsed 0.098515 sec
    // sumArraysOnGPU <<< 1048576, 16 >>> Time elapsed 0.190098 sec
    // sumArraysOnGPU <<< 2097152, 8 >>> Time elapsed 0.375030 sec
    // sumArraysOnGPU <<< 4194304, 4 >>> Time elapsed 0.803681 sec
    // sumArraysOnGPU <<< 8388608, 2 >>> Time elapsed 1.835205 sec
    // sumArraysOnGPU <<< 16777216, 1 >>> Time elapsed 3.515245 sec

    // check for kernel launch errors (launches don't return a status directly)
    CHECK(cudaGetLastError()) ;

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // verify device results against the CPU reference
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return(0);
}
2. 執行結果:
-----------------------------------------------------------------------------------------------
B. Timing With nvprof(NVIDIA profiling tool)
1. Before this, don't forget to add the CUDA bin directory to your environment variables
=> How to re-flash your Jetson TK1 Development Kit
=> G. Add CUDA bin Environment Variables
2. Login to TK1:
$ ssh ubuntu@192.168.0.106
3. Go to the release directory:
$ cd /home/ubuntu/RD/projects/cuda/Release
4. Timing with nvprof:
$ nvprof ./NSightCUDA
5. Result:
./NSightCUDA Starting...
==2996== NVPROF is profiling process 2996, command: ./NSightCUDA
Using Device 0: GK20A
Vector size 16777216
initialData Time elapsed 4.632509 sec
sumArraysOnHost Time elapsed 0.060414 sec
sumArraysOnGPU <<< 65536, 256 >>> Time elapsed 0.023884 sec
Arrays match.
==2996== Profiling application: ./NSightCUDA
==2996== Profiling result:
Time(%) Time Calls Avg Min Max Name
57.86% 137.14ms 3 45.714ms 44.908ms 46.267ms [CUDA memcpy HtoD]
32.42% 76.837ms 1 76.837ms 76.837ms 76.837ms [CUDA memcpy DtoH]
9.73% 23.053ms 1 23.053ms 23.053ms 23.053ms sumArraysOnGPU(float*, float*, float*, int)
==2996== API calls:
Time(%) Time Calls Avg Min Max Name
54.19% 219.10ms 4 54.774ms 45.602ms 79.567ms cudaMemcpy
39.03% 157.79ms 3 52.598ms 1.0795ms 152.74ms cudaMalloc
5.87% 23.717ms 1 23.717ms 23.717ms 23.717ms cudaDeviceSynchronize
0.64% 2.5713ms 3 857.08us 661.42us 1.0873ms cudaFree
0.14% 582.58us 83 7.0190us 1.4160us 302.67us cuDeviceGetAttribute
0.08% 331.83us 1 331.83us 331.83us 331.83us cudaGetDeviceProperties
0.03% 119.25us 1 119.25us 119.25us 119.25us cudaLaunch
0.01% 45.583us 1 45.583us 45.583us 45.583us cudaSetDevice
0.00% 17.416us 2 8.7080us 4.5000us 12.916us cuDeviceGetCount
0.00% 17.333us 1 17.333us 17.333us 17.333us cudaGetLastError
0.00% 10.667us 1 10.667us 10.667us 10.667us cuDeviceTotalMem
0.00% 9.0000us 1 9.0000us 9.0000us 9.0000us cudaConfigureCall
0.00% 6.2500us 1 6.2500us 6.2500us 6.2500us cuDeviceGetName
0.00% 4.8340us 4 1.2080us 667ns 1.8340us cudaSetupArgument
0.00% 4.1670us 2 2.0830us 1.8330us 2.3340us cuDeviceGet
備註: The nvprof result is more accurate than the host-side timing result
沒有留言:
張貼留言
注意:只有此網誌的成員可以留言。