cuda

Warp Execution

All threads of a warp are executed by the SIMD hardware as a bundle, where the same instruction is run for all threads.

Warp is the unit of thread scheduling in SMs.

when is it good?

When all threads within a warp follow the same control flow.

For example, for an if-else construct, the execution works well when either all threads execute the if part or all execute the else part.

when is it bad?

When threads within a warp take different control flow paths, the SIMD hardware will take multiple passes through these divergent paths. During each pass, the threads that follow the other path are not allowed to take effect.

These passes are sequential to each other, thus they will add to the execution time.

Solve divergence

Example, perform different operations on different columns

__global__ void matrixOperationRowWise(float* g_data, int width) {
    // Cast the pointer to float2*
    float2* g_data2 = (float2*)g_data;

    // Calculate row and column for float2 elements
    int row = blockIdx.y * blockDim.y + threadIdx.y; // blockIdx.y = {0,...,1023}, blockDim.y = 1
    int col = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x = {0,1}, blockDim.x = 256

    // Calculate linear index for float2 elements
    int idx = row * (width / 2) + col; // width divides 2 due to g_data2 is array-like

    // Ensure we don't go out of bounds
    if (row < width && col < width / 2) {
        float2 val = g_data2[idx]; // row-wise saved matrix

        // Perform operations on each component of float2
        val.x = (2 * col % 2 == 0) ? evenOperation(val.x) : oddOperation(val.x); // Even or Odd column
        val.y = (2 * col % 2 == 0) ? oddOperation(val.y) : evenOperation(val.y); // Odd or Even column

        // Write back results
        g_data2[idx] = val;
    }
}

// Usage: matrixOperationRowWise<<<dim3(2, 1024), dim3(256, 1)>>>(g_data, 1024); // The matrix is 1024 * 1024, and the girdDim.y = 1024, 
// gridDim.x = 2, 

void computeCpuResults(float *g_data, int col, int row, int niterations,
                       int nreps) {
  for (int r = 0; r < nreps; r++) {
    printf("Rep: %d\n", r);
 // This directive is used to parallelize the loop using OpenMP, a parallel programming extension for C/C++. It means that the loop iterations (over iy) will be distributed across multiple CPU threads to execute in parallel. This can significantly speed up the computation, especially on multi-core CPUs.   
#pragma omp parallel for
    // This loop iterates over the rows of the 2D data array.
    for (int iy = 0; iy < row; ++iy) {
        // Nested within the row loop, this loop iterates over the columns of the 2D data array.
      for (int ix = 0; ix < col; ++ix) {
        // idx is calculated to linearly index the 2D array as if it were a 1D array.
        int idx = iy * col + ix;

        float value = g_data[idx];

        for (int i = 0; i < niterations; i++) {
          if (ix % 4 == 0) {
            value += sqrtf(logf(value) + 1.f);
          } else if (ix % 4 == 1) {
            value += sqrtf(cosf(value) + 1.f);
          } else if (ix % 4 == 2) {
            value += sqrtf(sinf(value) + 1.f);
          } else if (ix % 4 == 3) {
            value += sqrtf(tanf(value) + 1.f);
          }
        }
        g_data[idx] = value;
      }
    }
  }
}

__global__ void kernel_A(float *g_data, int dimx, int dimy, int niterations) {
    // dimx: columns, dimy:rows gridDim.y = num_sms, blockDim.y = 32, 
    
    for (int iy = blockIdx.y * blockDim.y + threadIdx.y; iy < dimy;
         iy += blockDim.y * gridDim.y) {
            // blcokDim.x = gridDim.x = 1
        for (int ix = blockIdx.x * blockDim.x + threadIdx.x; ix < dimx;
             ix += blockDim.x * gridDim.x) {
            // iy starts at 0,  ix starts at 0 until to columns-1, dimx = columns
            int idx = iy * dimx + ix;
            // access the first row's value 
            float value = g_data[idx];
            // niterations = 5
            for (int i = 0; i < niterations; i++) {
                // Perform different mathematical operations based on column index
            }
            g_data[idx] = value;
        }
    }
}



void launchKernel(float * d_data, int dimx, int dimy, int niterations) {
  // Only change the contents of this function and the kernel(s). You may
  // change the kernel's function signature as you see fit. 

  //query number of SMs 
  // Get Device Properties:
  // cudaDeviceProp is a structure that holds the properties of a CUDA device.
  
  cudaDeviceProp prop;
  //cudaGetDeviceProperties(&prop, 0) retrieves the properties of the CUDA device. The 0 here refers to the device ID, which is typically 0 for systems with a single GPU. This function populates the prop structure with information about the device.
  cudaGetDeviceProperties(&prop, 0);
    //   Number of Streaming Multiprocessors (SMs):
    // prop.multiProcessorCount fetches the number of streaming multiprocessors (SMs) on the GPU from the prop structure.
    // num_sms is set to this value. The number of SMs is a hardware characteristic of the GPU and varies depending on the specific GPU model. It's not a fixed value and will differ from one GPU to another.
  int num_sms = prop.multiProcessorCount;

  dim3 block(1, 32);        //
  dim3 grid(1, num_sms);   // 1 block, and num_sms on y axis
  kernel_A<<<grid, block>>>(d_data, dimx, dimy, niterations);
}

float timing_experiment(float *d_data,
                        int dimx, int dimy, int niterations, int nreps) {
  float elapsed_time_ms = 0.0f;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, 0);
  for (int i = 0; i < nreps; i++) {
    launchKernel(d_data, dimx, dimy, niterations);
  }
  cudaEventRecord(stop, 0);
  cudaDeviceSynchronize();
  cudaEventElapsedTime(&elapsed_time_ms, start, stop);
  elapsed_time_ms /= nreps;

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  return elapsed_time_ms;
}

int main() {
  int dimx = 8 * 1024;
  int dimy = 8 * 1024;

  int nreps = 10;
  int niterations = 5;
   // Memory Size Calculation: Calculates the total number of bytes 
   // required to store the 2D array of floats.
  int nbytes = dimx * dimy * sizeof(float);

  float *d_data = 0, *h_data = 0, *h_gold = 0;
  // GPU Memory Allocation: Allocates memory on the GPU 
  // for d_data to store the 2D array.
  cudaMalloc((void **)&d_data, nbytes);
  if (0 == d_data) {
    printf("couldn't allocate GPU memory\n");
    return -1;
  }
  // Print GPU Memory Allocation: Prints the amount 
  // of memory allocated on the GPU.
  printf("allocated %.2f MB on GPU\n", nbytes / (1024.f * 1024.f));
  // CPU Memory Allocation: Allocates memory on the CPU 
  // for h_data (to store results) and h_gold (for initial data).
  h_data = (float *)malloc(nbytes);
  h_gold = (float *)malloc(nbytes);
  if (0 == h_data || 0 == h_gold) {
    printf("couldn't allocate CPU memory\n");
    return -2;
  }
  printf("allocated %.2f MB on CPU\n", 2.0f * nbytes / (1024.f * 1024.f));
  // Initialize h_gold Array: Initializes the h_gold array with random values.
  for (int i = 0; i < dimx * dimy; i++) h_gold[i] = 1.0f + 0.01*(float)rand()/(float)RAND_MAX;
  // Copy Data to GPU: Copies the initialized data from h_gold on the CPU to d_data on the GPU.
  // data in d_data on the GPU is also in a row-wise, linearized format
  cudaMemcpy(d_data, h_gold, nbytes, cudaMemcpyHostToDevice);
  // Run Timing Experiment (Warm-up): Runs a warm-up timing 
  // experiment for check purpose
  timing_experiment(d_data, dimx, dimy, niterations, 1);
  printf("Verifying solution\n");
  // Copy Data Back to CPU: Copies the processed data from the GPU back to the CPU.
  cudaMemcpy(h_data, d_data, nbytes, cudaMemcpyDeviceToHost);
  // Compute CPU Results for Verification: 
  //Computes the expected results on the CPU for verification.
  float rel_tol = .001;
  computeCpuResults(h_gold, dimx, dimy, niterations, 1);
  // Check Results: Compares the GPU results with the CPU results to verify correctness.
  bool pass = checkResults(h_gold, h_data, dimx, dimy, rel_tol);

  if (pass) {
    printf("Results are correct\n");
  } else {
    printf("FAIL:  results are incorrect\n");
  }  

  float elapsed_time_ms = 0.0f;
 
  elapsed_time_ms = timing_experiment(d_data, dimx, dimy, niterations,
                                      nreps);
  printf("A:  %8.2f ms\n", elapsed_time_ms);
  
  printf("CUDA: %s\n", cudaGetErrorString(cudaGetLastError()));

  if (d_data) cudaFree(d_data);
  if (h_data) free(h_data);

  cudaDeviceReset();

  return 0;
}

Method Explanation for improvement

There are potential control divergence issue as for the threads in a warp take different paths due to the if-else statement.
Here the matrix is row-major, so I cast the pointer to float4* so that each thread load the four matrix values. Do the computations, and write back. So I read and write global memory directly.
Processing in Kernel: When I perform operations inside the kernel, I am effectively working on the data pointed in float4 chunks. This can lead to more efficient processing due to reduced control divergence.

__global__ void kernel_A(float *g_data, int col, int row, int niterations) {
    // Cast the input pointer to float4*
    float4* A2 = (float4*)g_data;

    // Calculate row and column for float4 elements
    int rowid = blockIdx.y * blockDim.y + threadIdx.y;
    int colid = blockIdx.x * blockDim.x + threadIdx.x;

    // Calculate linear index for float4 elements
    int ix = rowid * (col / 4) + colid;

    // Ensure we don't go out of bounds
    if (rowid < row && colid < col / 4) {
        float4 value = A2[ix];
        // Perform operations on each component of float4
        for (int i = 0; i < niterations; i++) {
          int colidx = colid * 4; // Calculate the actual column index
          value.x += (colidx % 4 == 0) ? sqrtf(logf(value.x) + 1.f) : (colidx % 4 == 1) ? sqrtf(cosf(value.x) + 1.f) : (colidx % 4 == 2) ? sqrtf(sinf(value.x) + 1.f) : sqrtf(tanf(value.x) + 1.f);
          value.y += ((colidx + 1) % 4 == 0) ? sqrtf(logf(value.y) + 1.f) : ((colidx + 1) % 4 == 1) ? sqrtf(cosf(value.y) + 1.f) : ((colidx + 1) % 4 == 2) ? sqrtf(sinf(value.y) + 1.f) : sqrtf(tanf(value.y) + 1.f);
          value.z += ((colidx + 2) % 4 == 0) ? sqrtf(logf(value.z) + 1.f) : ((colidx + 2) % 4 == 1) ? sqrtf(cosf(value.z) + 1.f) : ((colidx + 2) % 4 == 2) ? sqrtf(sinf(value.z) + 1.f) : sqrtf(tanf(value.z) + 1.f);
          value.w += ((colidx + 3) % 4 == 0) ? sqrtf(logf(value.w) + 1.f) : ((colidx + 3) % 4 == 1) ? sqrtf(cosf(value.w) + 1.f) : ((colidx + 3) % 4 == 2) ? sqrtf(sinf(value.w) + 1.f) : sqrtf(tanf(value.w) + 1.f);
        }
        // Write back results
        A2[ix] = value;
    }
}

I have each block contains the maximum 1024 threads. Which means 1024 threads in the x-direction and 1 in the y-direction. The 1024 is a multiply of 32, making sure it is a multiply of the number of threads per warp.
I have a grid with 2 blocks along the x-axis and 8 * 1024 along the y-axis. This results in a total of 2 * 8 * 1024 blocks. So these 2 blocks will have 2048 threads, and using the 4 adjacent oeprations for each thread. This setup is suitable so along x-axis of the grid it processes 1 row of the matrix.

void launchKernel(float * d_data, int col, int row, int niterations) {
  // Only change the contents of this function and the kernel(s). You may
  // change the kernel's function signature as you see fit. 
  //query number of SMs
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  dim3 block(1024, 1);
  dim3 grid(8, row);
  kernel_A<<<grid, block>>>(d_data, col, row, niterations);
}