# Question:

GPU calculation with zero copy:

Use cudaEventRecord() and cudaEventSynchronize() to time the GPU work.

Print the elapsed time for all GPU calculations.

# Matrix multiplication Answer and Explanation

3. Define the CUDA Kernel: Write a kernel function that performs matrix multiplication. Each thread block will compute a portion of matrix C.

4. Invoke the Kernel: Launch the kernel with appropriate block and grid dimensions.

#include <cuda_runtime.h>

#define N 1024  // Matrix size (assuming square matrices)

if (row < n && col < n) {

float sum = 0.0f;

}

}

float *h_A, *h_B, *h_C;

cudaHostAlloc((void**)&h_A, size, cudaHostAllocDefault);

h_A[i] = 1.0f;  // Example initialization

h_B[i] = 2.0f;  // Example initialization

cudaMalloc((void**)&d_B, size);

cudaMalloc((void**)&d_C, size);

dim3 blockSize(16, 16);  // Threads per block (16x16 = 256 threads per block)

dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (N + blockSize.y - 1) / blockSize.y);

// Record start event

cudaEventRecord(start);

cudaEventSynchronize(stop);

// Calculate elapsed time

// Print timing results

std::cout << "GPU computation time: " << milliseconds << " ms" << std::endl;

std::cout << h_C[i * N + j] << " ";

}

cudaFree(d_B);

cudaFree(d_C);

cudaEventDestroy(start);

cudaEventDestroy(stop);

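- Kernel Definition: The `matrixMultiply` kernel performs the matrix multiplication on the GPU. Its `row < n && col < n` check keeps threads that fall outside the matrix from reading or writing out of bounds, which matters whenever the grid does not divide the matrix size evenly.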

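- Timing: CUDA events are used to measure the execution time of the kernel: `cudaEventRecord` marks the start and stop points, `cudaEventSynchronize` blocks the host until the stop event has completed, and `cudaEventElapsedTime` returns the time between the two events in milliseconds.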

