
先给总结

使用的是 colab 的 T4 卡
在 jupyter notebook 中执行cuda代码的方法
# 查看当前 Colab/Notebook 分配到的 GPU 信息
!nvidia-smi
# 查看 nvcc 编译器版本,确认 CUDA 编译工具链可用
!nvcc --version
# 安装 nvcc4jupyter,让 notebook 可以直接运行 CUDA C/C++ 代码单元
!pip install nvcc4jupyter
# 加载 nvcc4jupyter 扩展,之后可以使用 %%cuda 魔法命令
%load_ext nvcc4jupyter
# 编写cuda代码
%%cuda
#include <stdio.h>
int main() {
// 最简单的 CUDA C 程序:这里只在 CPU 端打印一句话,用于测试编译和运行流程
printf("CUDA C is working!\n");
return 0;
}
# 将cuda代码写入文件
%%writefile hello.cu
#include <stdio.h>
int main() {
// 将 CUDA 源码写入 hello.cu,后续手动调用 nvcc 编译
printf("Hello from manually compiled CUDA!\n");
return 0;
}
# 使用 nvcc 手动编译 hello.cu,并生成可执行文件 hello
!nvcc hello.cu -o hello
# 运行编译后的程序
!./hello
直接给出完整代码
%%writefile vector_add.cu
#include <cuda_runtime.h>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
// CUDA API 调用检查宏:如果某个 CUDA 调用失败,打印文件、行号和错误信息后退出程序
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
<< " code=" << static_cast<int>(err) \
<< " message=" << cudaGetErrorString(err) << std::endl; \
std::exit(EXIT_FAILURE); \
} \
} while (0)
// GPU 核函数:每个线程负责计算一个元素 c[idx] = a[idx] + b[idx]
__global__ void vectorAddKernel(const float* a, const float* b, float* c, int n) {
// blockIdx.x 表示当前线程块编号,blockDim.x 表示每个线程块的线程数,threadIdx.x 表示块内线程编号
// 三者组合得到当前线程在整个一维网格中的全局下标
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// 线程数量通常会向上取整,因此最后一个线程块可能有多余线程,需要防止越界访问
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
// CPU 版本的向量加法,用作性能对比和结果校验基准
void vectorAddCPU(const float* a, const float* b, float* c, int n) {
for (int i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
}
}
// 比较 CPU 和 GPU 的计算结果,允许一个很小的浮点误差 eps
bool checkResult(const std::vector<float>& cpu,
const std::vector<float>& gpu,
int n,
float eps = 1e-5f) {
for (int i = 0; i < n; ++i) {
if (std::fabs(cpu[i] - gpu[i]) > eps) {
std::cerr << "Mismatch at index " << i
<< ", CPU=" << cpu[i]
<< ", GPU=" << gpu[i] << std::endl;
returnfalse;
}
}
returntrue;
}
int main(int argc, char** argv) {
int n = 1 << 26; // 默认 67,108,864 个 float,大约每个数组 256MB
if (argc >= 2) {
n = std::atoi(argv[1]);
}
size_t bytes = static_cast<size_t>(n) * sizeof(float);
std::cout << "Vector size: " << n << " elements" << std::endl;
std::cout << "Array size : " << bytes / 1024.0 / 1024.0 << " MB each" << std::endl;
std::vector<float> h_a(n);
std::vector<float> h_b(n);
std::vector<float> h_c_cpu(n);
std::vector<float> h_c_gpu(n);
for (int i = 0; i < n; ++i) {
h_a[i] = static_cast<float>(i % 100) * 0.1f;
h_b[i] = static_cast<float>(i % 50) * 0.2f;
}
// -----------------------------
// CPU 计算计时
// -----------------------------
auto cpu_start = std::chrono::high_resolution_clock::now();
vectorAddCPU(h_a.data(), h_b.data(), h_c_cpu.data(), n);
auto cpu_end = std::chrono::high_resolution_clock::now();
double cpu_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
std::cout << "\n[CPU]" << std::endl;
std::cout << "CPU time: " << cpu_ms << " ms" << std::endl;
// -----------------------------
// GPU 显存申请
// -----------------------------
float* d_a = nullptr;
float* d_b = nullptr;
float* d_c = nullptr;
CHECK_CUDA(cudaMalloc(&d_a, bytes));
CHECK_CUDA(cudaMalloc(&d_b, bytes));
CHECK_CUDA(cudaMalloc(&d_c, bytes));
cudaEvent_t total_start, total_stop;
cudaEvent_t kernel_start, kernel_stop;
CHECK_CUDA(cudaEventCreate(&total_start));
CHECK_CUDA(cudaEventCreate(&total_stop));
CHECK_CUDA(cudaEventCreate(&kernel_start));
CHECK_CUDA(cudaEventCreate(&kernel_stop));
CHECK_CUDA(cudaEventRecord(total_start));
// -----------------------------
// 数据从 CPU 拷贝到 GPU
// -----------------------------
CHECK_CUDA(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice));
// -----------------------------
// 启动 GPU kernel
// -----------------------------
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "\n[GPU Launch Config]" << std::endl;
std::cout << "threadsPerBlock = " << threadsPerBlock << std::endl;
std::cout << "blocksPerGrid = " << blocksPerGrid << std::endl;
// warmup
vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaDeviceSynchronize());
CHECK_CUDA(cudaEventRecord(kernel_start));
vectorAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaEventRecord(kernel_stop));
CHECK_CUDA(cudaEventSynchronize(kernel_stop));
// -----------------------------
// 数据从 GPU 拷贝回 CPU
// -----------------------------
CHECK_CUDA(cudaMemcpy(h_c_gpu.data(), d_c, bytes, cudaMemcpyDeviceToHost));
CHECK_CUDA(cudaEventRecord(total_stop));
CHECK_CUDA(cudaEventSynchronize(total_stop));
float kernel_ms = 0.0f;
float total_ms = 0.0f;
CHECK_CUDA(cudaEventElapsedTime(&kernel_ms, kernel_start, kernel_stop));
CHECK_CUDA(cudaEventElapsedTime(&total_ms, total_start, total_stop));
bool ok = checkResult(h_c_cpu, h_c_gpu, n);
std::cout << "\n[GPU]" << std::endl;
std::cout << "GPU kernel time: " << kernel_ms << " ms" << std::endl;
std::cout << "GPU total time : " << total_ms << " ms" << std::endl;
// 向量加法每个元素大约读取 a、读取 b、写入 c,共 12 bytes
double kernel_bandwidth_gb_s = static_cast<double>(n) * 3.0 * sizeof(float) / (kernel_ms / 1000.0) / 1e9;
double total_bandwidth_gb_s = static_cast<double>(n) * 3.0 * sizeof(float) / (total_ms / 1000.0) / 1e9;
std::cout << "\n[Performance]" << std::endl;
std::cout << "Kernel effective bandwidth: " << kernel_bandwidth_gb_s << " GB/s" << std::endl;
std::cout << "Total effective bandwidth : " << total_bandwidth_gb_s << " GB/s" << std::endl;
std::cout << "Speedup kernel only vs CPU: " << cpu_ms / kernel_ms << "x" << std::endl;
std::cout << "Speedup total vs CPU : " << cpu_ms / total_ms << "x" << std::endl;
std::cout << "\n[Check]" << std::endl;
std::cout << "Result: " << (ok ? "PASS" : "FAIL") << std::endl;
CHECK_CUDA(cudaEventDestroy(total_start));
CHECK_CUDA(cudaEventDestroy(total_stop));
CHECK_CUDA(cudaEventDestroy(kernel_start));
CHECK_CUDA(cudaEventDestroy(kernel_stop));
CHECK_CUDA(cudaFree(d_a));
CHECK_CUDA(cudaFree(d_b));
CHECK_CUDA(cudaFree(d_c));
return ok ? 0 : 1;
}
执行 !nvcc -O3 -std=c++17 -arch=sm_75 vector_add.cu -o vector_add 编译
运行 !./vector_add 1048576
结果:
Vector size: 1048576 elements
Array size : 4 MB each
[CPU]
CPU time: 0.751115 ms
[GPU Launch Config]
threadsPerBlock = 256
blocksPerGrid = 4096
[GPU]
GPU kernel time: 0.053664 ms
GPU total time : 3.70288 ms
[Performance]
Kernel effective bandwidth: 234.476 GB/s
Total effective bandwidth : 3.39814 GB/s
Speedup kernel only vs CPU: 13.9966x
Speedup total vs CPU : 0.202846x
[Check]
Result: PASS

回答问题:
blockDim.x 由 kernel 启动参数 threadsPerBlock = 256 决定;进入 GPU kernel 后,CUDA 自动把这个值提供给每个线程使用。