To test the performance of our Nvidia GPU, I had to write my first CUDA program, which multiplies two vectors of 2GB each:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <mm_malloc.h>
size_t LOOP = 10;                        /* number of kernel launches to time */
const size_t COLUMNS = 512 * 1048576;    /* 512M floats per vector */
const size_t BUFF_LEN = 4 * COLUMNS;     /* bytes per vector: 2 GB */
/* Element-wise multiplication: each thread computes one element of C. */
__global__ void VecMul(const float *A, const float *B, float *C, size_t count) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < count)
    C[i] = A[i] * B[i];
}
float test_cuda(float *left, float *right, float *result, size_t count) {
  float total = 0.0f;   /* not computed by the kernel; kept only as the return value */
  float *left_d, *right_d, *result_d;
  struct timeval before, after, c_before, c_after;
  size_t i;
  cudaError_t error;
  error = cudaMalloc((void**) &left_d, BUFF_LEN);
  if (error != cudaSuccess) {
    printf("Failed to malloc left_d!\n");
    exit(1);
  }
  error = cudaMalloc((void**) &right_d, BUFF_LEN);
  if (error != cudaSuccess) {
    printf("Failed to malloc right_d!\n");
    exit(1);
  }
  error = cudaMalloc((void**) &result_d, BUFF_LEN);
  if (error != cudaSuccess) {
    printf("Failed to malloc result_d!\n");
    exit(1);
  }
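  /* Copy the inputs to the device, launch the kernel LOOP times, and copy the
     result back, timing the kernel launches and the whole sequence separately. */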
  gettimeofday(&before, NULL);
  cudaMemcpy(left_d, left, BUFF_LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(right_d, right, BUFF_LEN, cudaMemcpyHostToDevice);
  gettimeofday(&c_before, NULL);
  for (i = 0; i < LOOP; i++) {
    /* 256 threads per block, with enough blocks to cover all elements. */
    VecMul<<<(count + 255) / 256, 256>>>(left_d, right_d, result_d, count);
  }
  gettimeofday(&c_after, NULL);
  cudaMemcpy(result, result_d, BUFF_LEN, cudaMemcpyDeviceToHost);
  gettimeofday(&after, NULL);
  printf("CUDA compute:\t%lu\n", c_after.tv_usec + c_after.tv_sec * 1000000 -
         (c_before.tv_usec + c_before.tv_sec * 1000000));
  printf("CUDA:\t%lu\n", after.tv_usec + after.tv_sec * 1000000 -
         (before.tv_usec + before.tv_sec * 1000000));
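  /* Print a few samples from the start and the end of the result as a sanity check. */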
  for (i = 0; i < 4; i++) {
    printf("[Sample: %f]\n", result[i]);
  }
  for (i = COLUMNS - 4; i < COLUMNS; i++) {
    printf("[Sample: %f]\n", result[i]);
  }
  cudaFree(left_d);
  cudaFree(right_d);
  cudaFree(result_d);
  return total;
}
int main(int argc, char *argv[]) {
  float *left = (float*)_mm_malloc(BUFF_LEN, 32);
  float *right = (float*)_mm_malloc(BUFF_LEN, 32);
  float *result = (float*)_mm_malloc(BUFF_LEN, 32);
  size_t count = BUFF_LEN / sizeof(float);
  size_t i;
  if (argc > 1) {
      LOOP = atol(argv[1]);
  }
  for (i = 0; i < count; i++) {
    left[i] = 1.23456;
    right[i] = 1.23456;
    result[i] = 1.23456;
  }
  test_cuda(left, right, result, count);
  _mm_free(left);
  _mm_free(right);
  _mm_free(result);
  return 0;
}
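
To build and run it, plain nvcc is enough; something like this should work (the file name is my choice):

nvcc -o vec_mul vec_mul.cu
./vec_mul 10

The optional argument sets LOOP, i.e. how many times the kernel is launched.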

Luckily, it works 🙂
The cudaMemcpy() calls cost about 1 second, but the multiplication of the two vectors cost only 80 microseconds (even with the default LOOP of 10). Therefore I reckon the GPU is perfect for training machine-learning models, but not so promising for prediction once the model has been built.
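
As a side note, CUDA also provides events for timing work on the device itself; a minimal sketch of how the kernel inside test_cuda() could be timed with them (error checks omitted, purely illustrative) might look like this:

  cudaEvent_t start, stop;
  float ms = 0.0f;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);
  VecMul<<<(count + 255) / 256, 256>>>(left_d, right_d, result_d, count);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);              /* wait until the kernel has finished */
  cudaEventElapsedTime(&ms, start, stop);  /* elapsed time in milliseconds */
  printf("kernel: %f ms\n", ms);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);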

Note: Use cudaMalloc()/cudaMemcpy() for the device buffers instead of malloc()/memcpy() from the standard C library, or else the VecMul<<<...>>> kernel will not run.
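
For completeness: another option that I believe would also work here is unified memory via cudaMallocManaged(), which returns a pointer usable from both the host and the device (assuming the GPU and driver support it), so no explicit cudaMemcpy() is needed. A minimal sketch:

  /* Hypothetical alternative to the explicit cudaMalloc()/cudaMemcpy() pattern. */
  float *A, *B, *C;
  cudaMallocManaged(&A, BUFF_LEN);
  cudaMallocManaged(&B, BUFF_LEN);
  cudaMallocManaged(&C, BUFF_LEN);
  for (size_t i = 0; i < COLUMNS; i++) {
    A[i] = 1.23456f;
    B[i] = 1.23456f;
  }
  VecMul<<<(COLUMNS + 255) / 256, 256>>>(A, B, C, COLUMNS);
  cudaDeviceSynchronize();   /* wait before reading the result on the host */
  printf("C[0] = %f\n", C[0]);
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);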