To test the performance of our Nvidia GPU, I wrote my first CUDA program, which multiplies two vectors of 2 GB each:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <mm_malloc.h>
/* Number of kernel launches to time; overridable via argv[1] in main(). */
size_t LOOP = 10;
/* Vector length: 512Mi elements (2 GiB of floats per buffer). */
const size_t COLUMNS = 512 * 1048576;
/* Byte size of each of the three vectors (was hard-coded as 4 * COLUMNS). */
const size_t BUFF_LEN = COLUMNS * sizeof(float);
/*
 * Element-wise product: C[i] = A[i] * B[i] for i in [0, COLUMNS).
 *
 * Uses a grid-stride loop so it is correct for ANY <<<grid, block>>>
 * configuration and never writes out of bounds.  The original version
 * indexed with threadIdx.x alone, which both limits coverage to a single
 * block (max 1024 threads) and cannot reach COLUMNS = 512Mi elements.
 *
 * `total` is kept only for interface compatibility; it is not used.
 */
__global__ void VecMul(float *A, float *B, float *C, float *total) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < COLUMNS; i += stride) {
        C[i] = A[i] * B[i];
    }
}
/*
 * Copy two `count`-element float vectors to the device, multiply them
 * element-wise LOOP times with VecMul, and copy the product back into
 * `result`.  Prints wall-clock timings (microseconds) for the kernel
 * loop alone and for the full transfer+compute round trip, plus a few
 * sample output values.  Exits the process on any CUDA error.
 *
 * Returns `total`, which is always 0.0f (the kernel produces no scalar;
 * the original returned this variable uninitialized).
 */
float test_cuda(float *left, float *right, float *result, size_t count) {
    float total = 0.0f;
    float *left_d, *right_d, *result_d;
    struct timeval before, after, c_before, c_after;
    size_t i, bytes = count * sizeof(float);
    cudaError_t error;  /* was int; cudaError_t is the API's return type */

    error = cudaMalloc((void **)&left_d, bytes);
    if (error != cudaSuccess) {
        printf("Failed to malloc left_d: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    error = cudaMalloc((void **)&right_d, bytes);
    if (error != cudaSuccess) {
        printf("Failed to malloc right_d: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    error = cudaMalloc((void **)&result_d, bytes);
    if (error != cudaSuccess) {
        printf("Failed to malloc result_d: %s\n", cudaGetErrorString(error));
        exit(1);
    }

    gettimeofday(&before, NULL);
    error = cudaMemcpy(left_d, left, bytes, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        printf("Failed to copy left: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    error = cudaMemcpy(right_d, right, bytes, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        printf("Failed to copy right: %s\n", cudaGetErrorString(error));
        exit(1);
    }

    /* The original <<<1, COLUMNS>>> launch requested 512Mi threads in a
     * single block (hardware limit: 1024), so every launch failed
     * silently and the kernel never ran.  Launch a real grid instead;
     * the kernel's bounds check covers the ceil-div remainder. */
    int threads = 256;
    int blocks = (int)((count + threads - 1) / threads);

    gettimeofday(&c_before, NULL);
    for (i = 0; i < LOOP; i++) {
        /* NULL instead of a host-memory pointer for the unused arg:
         * passing &total (host address) to device code invites an
         * illegal-address fault if the kernel ever dereferences it. */
        VecMul<<<blocks, threads>>>(left_d, right_d, result_d, NULL);
    }
    error = cudaGetLastError();  /* surfaces bad-launch-config errors */
    if (error != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    /* Launches are asynchronous: without this sync the timer below
     * measures only launch overhead, not the multiplication itself. */
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        printf("Kernel execution failed: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    gettimeofday(&c_after, NULL);

    error = cudaMemcpy(result, result_d, bytes, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        printf("Failed to copy result: %s\n", cudaGetErrorString(error));
        exit(1);
    }
    gettimeofday(&after, NULL);

    printf("CUDA compute:\t%lu\n", c_after.tv_usec + c_after.tv_sec * 1000000 -
           (c_before.tv_usec + c_before.tv_sec * 1000000));
    printf("CUDA:\t%lu\n", after.tv_usec + after.tv_sec * 1000000 -
           (before.tv_usec + before.tv_sec * 1000000));

    /* Spot-check the first and last four elements of the result. */
    for (i = 0; i < 4; i++) {
        printf("[Sample: %f]\n", result[i]);
    }
    for (i = count - 4; i < count; i++) {
        printf("[Sample: %f]\n", result[i]);
    }

    cudaFree(left_d);
    cudaFree(right_d);
    cudaFree(result_d);
    return total;
}
/*
 * Allocate three 2 GiB aligned host buffers, fill them with a test
 * pattern, run the CUDA vector-multiply benchmark, and release the
 * buffers.  argv[1] (optional) sets LOOP, the number of timed launches.
 */
int main(int argc, char *argv[]) {
    float *left = (float *)_mm_malloc(BUFF_LEN, 32);
    float *right = (float *)_mm_malloc(BUFF_LEN, 32);
    float *result = (float *)_mm_malloc(BUFF_LEN, 32);
    size_t count = BUFF_LEN / sizeof(float);
    size_t i;  /* was int: count (512Mi) needs a size_t loop counter */

    /* Each buffer is 2 GiB -- check for failure before touching them. */
    if (left == NULL || right == NULL || result == NULL) {
        printf("Failed to allocate host buffers!\n");
        exit(1);
    }
    if (argc > 1) {
        LOOP = atol(argv[1]);
    }
    for (i = 0; i < count; i++) {
        left[i] = 1.23456f;
        right[i] = 1.23456f;
        result[i] = 1.23456f;
    }
    test_cuda(left, right, result, count);

    /* Memory from _mm_malloc must be released with _mm_free, never
     * free() -- the original called free() and also leaked `result`. */
    _mm_free(left);
    _mm_free(right);
    _mm_free(result);
    return 0;
}
Luckily, it works 🙂
The cudaMemcpy() calls cost about 1 second, while the timer around the multiplication loop reported only 80 microseconds (even with the default of 10 loops). Be careful with that second number, though: kernel launches are asynchronous, so without a cudaDeviceSynchronize() before reading the clock, it measures only the launch overhead, not the multiplication itself. Still, the data transfer clearly dominates this workload, so I reckon the GPU is a great fit for training machine-learning models (where data stays on the device across many iterations), but less promising for one-shot inference once the model has been built, since the transfer cost cannot be amortized.
Note: Use cudaMalloc()/cudaMemcpy() for device buffers instead of malloc()/memcpy() from the standard C library; otherwise the kernel launched with VecMul<<<>>> will receive host pointers it cannot dereference and will not run correctly.