CUDA

Trace memory error of CUDA program

A program that uses CUDA for computation on the GPU reported a memory error:

terminate called after throwing an instance of 'std::runtime_error'
  what():  [CUDA] an illegal memory access was encountered LightGBM/src/treelearner/cuda_tree_learner.cpp 239

For an ordinary C++ program we would use gdb for debugging; for a CUDA program we should use cuda-gdb instead. Make sure the CUDA code is compiled with the -g flag (and -G if device-side debug information is needed), and then run:

/usr/local/cuda-11.0/bin/cuda-gdb python3
(cuda-gdb) run test.py

After a while, cuda-gdb shows the exact location in the code where the illegal memory access happened:

CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x1668b2f0 (histogram_16_64_256.cu:182)
Thread 1 "python3" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 10, block (2163,0,0), thread (0,0,0), device 0, sm 0, warp 3, lane 0]
0x000000001668b380 in LightGBM::histogram16<<<(7360,1,1),(16,1,1)>>> () at LightGBM/src/treelearner/kernels/histogram_16_64_256.cu:185
185            feature = (feature >> ((ind & 1) << 2)) & 0xf;

Debug CUDA error for PyTorch

After I switched my code to a different dataset, the training failed:

/tmp/pip-req-build-_tx3iysr/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:310: operator(): block: [0,0,0], thread: [59,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/tmp/pip-req-build-_tx3iysr/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:310: operator(): block: [0,0,0], thread: [60,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/tmp/pip-req-build-_tx3iysr/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:310: operator(): block: [0,0,0], thread: [61,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/tmp/pip-req-build-_tx3iysr/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:310: operator(): block: [0,0,0], thread: [62,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/tmp/pip-req-build-_tx3iysr/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:310: operator(): block: [0,0,0], thread: [63,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
Traceback (most recent call last):
  File "train.py", line 337, in <module>
    train(args, train_loader, eval_loader)
  File "train.py", line 189, in train
    sounds = aug(sounds)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 881, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/sanbai/birds_sound_classification/utils/augment.py", line 13, in forward
    image = (image - image.mean()) / image.std()
RuntimeError: CUDA error: device-side assert triggered

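It’s terribly hard to find the cause of this common error, “RuntimeError: CUDA error: device-side assert triggered”: CUDA kernels run asynchronously, so the Python traceback usually points at a later, unrelated line rather than the call that actually failed. Someone on GitHub recommends a method: set the environment variable CUDA_LAUNCH_BLOCKING=1 when starting the program (for example, CUDA_LAUNCH_BLOCKING=1 python3 train.py), so that every kernel launch is synchronous and the error is reported at the call that caused it.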

Now the real error behind the RuntimeError shows up: I had configured the model with the wrong number of categories.

A CUDA program to test performance of GPU

To test the performance of our Nvidia GPU, I wrote my first CUDA program, which multiplies two vectors of 2 GB each:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <immintrin.h>
// Number of timed kernel launches; main() overrides it from argv[1] when given.
size_t LOOP = 10;
// Number of float elements in each vector (512 Mi elements).
const size_t COLUMNS = static_cast<size_t>(512) << 20;
// Size in bytes of each vector buffer (2 GiB with 4-byte floats).
const size_t BUFF_LEN = COLUMNS * sizeof(float);
// Abort the program with a diagnostic when a CUDA runtime call has failed.
static void check_cuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s (%s)\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

// Microseconds elapsed between two gettimeofday() samples.
static unsigned long elapsed_usec(const struct timeval &start,
                                  const struct timeval &stop) {
  return (unsigned long)((stop.tv_sec - start.tv_sec) * 1000000L +
                         (stop.tv_usec - start.tv_usec));
}

// Element-wise product: C[i] = A[i] * B[i].
//
// Launch layout: 1-D grid of 1-D blocks; each thread handles the single
// element at its flat global index, and indices >= COLUMNS are skipped.
// The launch must therefore supply at least COLUMNS threads in total to
// cover the whole vector. No shared memory is used.
//
// A, B and C must be device pointers valid for every index the launch
// touches. `total` is kept only for interface compatibility; it is never
// dereferenced.
__global__ void VecMul(float *A, float *B, float *C, float *total) {
  (void)total;
  // Widen before multiplying so the flat index cannot overflow 32 bits.
  const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < COLUMNS) {
    C[i] = A[i] * B[i];
  }
}

// Copies `left` and `right` (COLUMNS floats each) to the GPU, runs VecMul
// LOOP times, copies the product back into `result`, and prints the elapsed
// time in microseconds for the kernels alone ("CUDA compute") and for
// copies + kernels ("CUDA"), followed by the first and last four products.
//
// left, right, result: host buffers of at least BUFF_LEN bytes.
// count: unused; buffer sizes come from BUFF_LEN / COLUMNS.
// Returns 0.0f (the kernel does not produce a total).
// Any CUDA failure prints a diagnostic to stderr and exits with status 1.
float test_cuda(float *left, float *right, float *result, size_t count) {
  (void)count;
  float total = 0.0f;
  float *left_d = NULL, *right_d = NULL, *result_d = NULL;
  struct timeval before, after, c_before, c_after;

  check_cuda(cudaMalloc((void**) &left_d, BUFF_LEN), "Failed to malloc left_d!");
  check_cuda(cudaMalloc((void**) &right_d, BUFF_LEN), "Failed to malloc right_d!");
  check_cuda(cudaMalloc((void**) &result_d, BUFF_LEN), "Failed to malloc result_d!");

  // A block may hold at most 1024 threads, so spread the elements over many
  // blocks (ceil-div) instead of asking for COLUMNS threads in one block.
  const unsigned int threads = 256;
  const unsigned int blocks = (unsigned int)((COLUMNS + threads - 1) / threads);

  gettimeofday(&before, NULL);
  check_cuda(cudaMemcpy(left_d, left, BUFF_LEN, cudaMemcpyHostToDevice),
             "Failed to copy left to device!");
  check_cuda(cudaMemcpy(right_d, right, BUFF_LEN, cudaMemcpyHostToDevice),
             "Failed to copy right to device!");
  gettimeofday(&c_before, NULL);
  for (size_t iter = 0; iter < LOOP; iter++) {
    VecMul<<<blocks, threads>>>(left_d, right_d, result_d, NULL);
    // Launch-configuration errors are only visible through cudaGetLastError().
    check_cuda(cudaGetLastError(), "Failed to launch VecMul!");
  }
  // Kernel launches are asynchronous: wait for completion before stopping the
  // clock, otherwise only the launch overhead is measured.
  check_cuda(cudaDeviceSynchronize(), "VecMul execution failed!");
  gettimeofday(&c_after, NULL);
  check_cuda(cudaMemcpy(result, result_d, BUFF_LEN, cudaMemcpyDeviceToHost),
             "Failed to copy result to host!");
  gettimeofday(&after, NULL);

  printf("CUDA compute:\t%lu\n", elapsed_usec(c_before, c_after));
  printf("CUDA:\t%lu\n", elapsed_usec(before, after));
  for (size_t i = 0; i < 4; i++) {
    printf("[Sample: %f]\n", result[i]);
  }
  for (size_t i = COLUMNS - 4; i < COLUMNS; i++) {
    printf("[Sample: %f]\n", result[i]);
  }

  check_cuda(cudaFree(left_d), "Failed to free left_d!");
  check_cuda(cudaFree(right_d), "Failed to free right_d!");
  check_cuda(cudaFree(result_d), "Failed to free result_d!");
  return total;
}
// Entry point: allocates three 32-byte-aligned host vectors of BUFF_LEN
// bytes, fills them with 1.23456, and runs the GPU benchmark.
//
// argv[1] (optional): number of kernel launches to time; replaces LOOP.
// Returns 0 on success, 1 on a bad argument or failed host allocation.
int main(int argc, char *argv[]) {
  if (argc > 1) {
    const long loops = atol(argv[1]);
    if (loops < 0) {
      // A negative value would wrap to an enormous size_t loop count.
      fprintf(stderr, "LOOP must not be negative: %s\n", argv[1]);
      return 1;
    }
    LOOP = (size_t)loops;
  }

  float *left = (float*)_mm_malloc(BUFF_LEN, 32);
  float *right = (float*)_mm_malloc(BUFF_LEN, 32);
  float *result = (float*)_mm_malloc(BUFF_LEN, 32);
  if (left == NULL || right == NULL || result == NULL) {
    fprintf(stderr, "Failed to allocate host buffers!\n");
    if (left != NULL) _mm_free(left);
    if (right != NULL) _mm_free(right);
    if (result != NULL) _mm_free(result);
    return 1;
  }

  const size_t count = BUFF_LEN / sizeof(float);
  for (size_t i = 0; i < count; i++) {
    left[i] = 1.23456f;
    right[i] = 1.23456f;
    result[i] = 1.23456f;
  }

  test_cuda(left, right, result, count);

  // Memory obtained from _mm_malloc() must be released with _mm_free(),
  // not free().
  _mm_free(left);
  _mm_free(right);
  _mm_free(result);
  return 0;
}

Luckily, it works :)
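The cudaMemcpy() calls took about 1 second, while the multiplication of the two vectors was measured at only 80 microseconds (even with the default LOOP of 10). Since the host-to-device copy dominates, I reckon the GPU is perfect for training machine-learning models, but less promising for prediction once the model has been built. (Caveat: kernel launches are asynchronous, and a launch that requests more threads per block than the device allows, typically 1024, fails without running the kernel at all. A timing like this therefore only reflects real compute time if the launch is checked with cudaGetLastError() and followed by cudaDeviceSynchronize() before the timer is stopped.)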

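Note: device buffers must be allocated with cudaMalloc() and filled with cudaMemcpy() rather than with malloc()/memcpy() from the standard C library; a kernel cannot dereference ordinary host pointers, so otherwise VecMul<<<>>> will not run correctly.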

Robin on Linux

CUDA