网站页面总数,网站建设策划书案例,基于微信的网站开发,wordpress导航网站模板 在2.0中用一维网格和块对一维向量进行了求和。在2.1中实例化了二维的网格和块。接下来用二维网格（grid）和二维块（block）对矩阵进行求和。
#include stdio.h
#include stdlib.h
#include time.h
#i…在2.0中用一维网格和块对一维向量进行了求和。在2.1中实例化了二维的网格和块。接下来用二维网格（grid）和二维块（block）对矩阵进行求和。
// 2-D matrix addition on the GPU using a 2-D grid of 2-D thread blocks,
// verified against a CPU reference and timed with CUDA events.
// Build with nvcc (which implicitly provides the CUDA runtime declarations).
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#ifdef _WIN32
#include <windows.h>
#endif

typedef unsigned long DWORD;

// Evaluates a CUDA runtime call once; on failure prints the source location,
// the numeric error code and its description, then terminates the process.
// Wrapped in do { } while (0) so that `CHECK(x);` is a single statement and
// is safe inside unbraced if/else bodies.
#define CHECK(call)                                                     \
    do {                                                                \
        const cudaError_t error = (call);                               \
        if (error != cudaSuccess)                                       \
        {                                                               \
            printf("Error: %s: %d\n", __FILE__, __LINE__);              \
            printf("code :%d reason :%s\n", error,                      \
                   cudaGetErrorString(error));                          \
            exit(1);                                                    \
        }                                                               \
    } while (0)

// Compares the first N elements of hostRef and gpuRef.
// Prints the first mismatching element (absolute difference > 1e-8) and
// stops, or prints "array matches" when every element agrees.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        // fabsf, not abs: the integer abs() overload would truncate any
        // difference smaller than 1.0 to zero and hide real mismatches.
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("array matches\n");
}

// Fills ip[0..size) with pseudo-random values in [0.0, 25.5].
// NOTE(review): the generator is re-seeded from the wall clock on every
// call, so two calls within the same second produce identical data.
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned int) time(&t));

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float) (rand() & 0xff) / 10.0f;
    }
}

// CPU reference: C = A + B for a row-major matrix with ny rows of nx floats.
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }
        // Advance all three row pointers to the next row.
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

// GPU kernel: MatC = MatA + MatB for a row-major nx-by-ny matrix.
// Launch layout: 2-D grid of 2-D blocks, one thread per element; x indexes
// columns and y indexes rows. The grid may overshoot the matrix, so threads
// outside [0,nx) x [0,ny) do nothing. Uses no shared memory.
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    if (ix < (unsigned int)nx && iy < (unsigned int)ny)
    {
        // 64-bit flat index so iy * nx cannot wrap for large matrices.
        size_t idx = (size_t)iy * (size_t)nx + ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

// Allocates two 16384 x 16384 float matrices, adds them on the GPU with a
// 32 x 32 block configuration, times the kernel with CUDA events and checks
// the result against the CPU reference.
int main(int argc, char **argv)
{
    (void)argc;
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    // set up data
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("matrix size %d %d\n", nx, ny);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *) malloc(nBytes);
    h_B = (float *) malloc(nBytes);
    hostRef = (float *) malloc(nBytes);
    gpuRef = (float *) malloc(nBytes);

    // Each buffer is about 1 GiB, so a failed host allocation is plausible.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host allocation of %llu bytes failed\n",
                (unsigned long long)nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    initialData(h_A, nxy);
    initialData(h_B, nxy);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    // Ceiling division so the grid covers matrices that are not a multiple
    // of the block size.
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    CHECK(cudaEventRecord(start));
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaGetLastError());        // reports an invalid launch configuration
    CHECK(cudaDeviceSynchronize());   // reports faults raised while the kernel ran
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("execution config (%d,%d), (%d,%d)\n", grid.x, grid.y, block.x, block.y);
    printf("Kernel execution time: %f ms\n", milliseconds);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // copy kernel result back to host
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);

    checkResult(hostRef, gpuRef, nxy);

    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return 0;
}

/*
 * The overall flow mirrors the 1-D vector sum example.
 *
 * Sample output:
 *   Using Device 0 : NVIDIA GeForce RTX 4090
 *   matrix size 16384 16384
 *   execution config (512,512), (32,32)
 *   Kernel execution time: 5.351136 ms
 *   array matches
 */
block的尺寸为32x32（由 block(dimx, dimy) 定义）。将block尺寸改为32x16： execution config (512,1024), (32,16) Kernel execution time: 3.778752 ms
进一步改变block尺寸为16x16: execution config (1024,1024), (16,16) Kernel execution time: 3.712736 ms
之前尝试使用nvprof测试kernel性能时，报告了如下警告：Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
参考 https://blog.csdn.net/TH_NUM/article/details/109952643 使用nsys 将C:\Program Files\NVIDIA Corporation\Nsight Systems 2024.5.1\target-windows-x64加入环境变量即可
nsys profile --stats=true .\sum_matrix_on_gpu_timer.exe
输出：
Collecting data...
Generating C:\Users\ADMINI~1\AppData\Local\Temp\nsys-report-ffa3.qdstrm
[1/8] [100%] report2.nsys-rep
[2/8] [100%] report2.sqlite
[3/8] Executing nvtx_sum stats report
SKIPPED: C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing osrt_sum stats report
SKIPPED: No data available.
[5/8] Executing cuda_api_sum stats report

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)  Max (ns)   StdDev (ns)  Name
 --------  ---------------  ---------  -----------  ----------  --------  ---------  -----------  ----------------------
     93.3        321764988          3  107254996.0  91069908.0  83897570  146797510   34432084.1  cudaMemcpy
      4.0         13772507          3    4590835.7   4393180.0   3984976    5394351     725179.5  cudaFree
      1.5          5118078          3    1706026.0   1249576.0    819401    3049101    1182856.9  cudaMalloc
      1.0          3496955          1    3496955.0   3496955.0   3496955    3496955          0.0  cudaDeviceSynchronize
      0.1           459711          1     459711.0    459711.0    459711     459711          0.0  cudaLaunchKernel
      0.0            49593          2      24796.5     24796.5       707      48886      34067.7  cudaEventCreate
      0.0            22341          1      22341.0     22341.0     22341      22341          0.0  cuLibraryUnload
      0.0            18196          2       9098.0      9098.0      7920      10276       1665.9  cudaEventRecord
      0.0            15060          1      15060.0     15060.0     15060      15060          0.0  cudaEventSynchronize
      0.0             1961          1       1961.0      1961.0      1961       1961          0.0  cuCtxSynchronize
      0.0             1434          1       1434.0      1434.0      1434       1434          0.0  cuModuleGetLoadingMode
      0.0             1012          2        506.0       506.0       205        807        425.7  cudaEventDestroy
      0.0              181          1        181.0       181.0       181        181          0.0  cuDeviceGetLuid

[6/8] Executing cuda_gpu_kern_sum stats report

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  -----------------------------------------------------
    100.0          3453326          1  3453326.0  3453326.0   3453326   3453326          0.0  sumMatrixOnGPU2D(float *, float *, float *, int, int)

[7/8] Executing cuda_gpu_mem_time_sum stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)    Med (ns)    Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  -----  ----------  ----------  --------  --------  -----------  ----------------------------
     68.3        180949528      2  90474764.0  90474764.0  89939258  91010270     757319.8  [CUDA memcpy Host-to-Device]
     31.7         83834368      1  83834368.0  83834368.0  83834368  83834368          0.0  [CUDA memcpy Device-to-Host]

[8/8] Executing cuda_gpu_mem_size_sum stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  -----  --------  --------  --------  --------  -----------  ----------------------------
   2147.484      2  1073.742  1073.742  1073.742  1073.742        0.000  [CUDA memcpy Host-to-Device]
   1073.742      1  1073.742  1073.742  1073.742  1073.742        0.000  [CUDA memcpy Device-to-Host]

Generated:
    C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.nsys-rep
    C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite