当前位置：首页 > news >正文

深圳上市公司网站建设装饰公司响应式网站建设案例

news 2026/1/21 3:22:17

深圳上市公司网站建设,装饰公司响应式网站建设案例,今天出入苏州最新通知,好的网页制作公司有哪些

随着OpenCL的普及，现在有越来越多的移动设备以及平板、超级本等都支持OpenCL异构计算。而这些设备与桌面计算机、服务器相比而言性能不是占主要因素的，反而能耗更受人关注。因此，这些移动设备上的GPU与CPU基本都是在同一芯片上（So…

随着OpenCL的普及，现在有越来越多的移动设备以及平板、超级本等都支持OpenCL异构计算。而这些设备与桌面计算机、服务器相比而言性能不是占主要因素的，反而能耗更受人关注。因此，这些移动设备上的GPU与CPU基本都是在同一芯片上（SoC），或者GPU就已经成为了处理器的一部分（像Intel Ivy Bridge架构开始的处理器（Intel HD Graphics 4000开始支持OpenCL），AMD APU等）。因此，在这些设备上做OpenCL的异构并行计算的话，我们不需要像桌面端那些独立GPU那样，要把主存数据通过PCIe搬运到GPU端，然后等GPU计算结束后再搬回到主存。我们只需要将给GPU端分配的显存映射到主机端即可。这样，在主机端我们也能直接通过指针来操作这块存储数据。

下面编写了一个比较简单的例子来描述如何使用OpenCL的存储器映射特性。这个例子在MacBook Air，macOS 10.9.2下完成，并通过Xcode 5.1，Apple LLVM 5.1的编译与运行。硬件环境为Intel Core i7 4650U, Intel Graphics 5000, 8GB DDR3L, 128GB SSD。

这是主机端代码（C源文件）：

```c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

int main(void)
{
    cl_int ret;

    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memObj = NULL;
    char *kernelSource = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    int *pHostBuffer = NULL;

    clGetPlatformIDs(1, &platform_id, NULL);
    if(platform_id == NULL)
    {
        puts("Get OpenCL platform failed!");
        goto FINISH;
    }

    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if(device_id == NULL)
    {
        puts("No GPU available as a compute device!");
        goto FINISH;
    }

    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if(context == NULL)
    {
        puts("Context not established!");
        goto FINISH;
    }

    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if(command_queue == NULL)
    {
        puts("Command queue cannot be created!");
        goto FINISH;
    }

    // 指定内核源文件路径
    const char *pFileName = "/Users/zennychen/Downloads/test.cl";

    FILE *fp = fopen(pFileName, "r");
    if (fp == NULL)
    {
        puts("The specified kernel source file cannot be opened!");
        goto FINISH;
    }
    fseek(fp, 0, SEEK_END);
    const long kernelLength = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    kernelSource = malloc(kernelLength);

    fread(kernelSource, 1, kernelLength, fp);
    fclose(fp);

    program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, (const size_t*)&kernelLength, &ret);
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        size_t len;
        char buffer[8 * 1024];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        goto FINISH;
    }

    kernel = clCreateKernel(program, "test", &ret);
    if(kernel == NULL)
    {
        puts("Kernel failed to create!");
        goto FINISH;
    }

    const size_t contentLength = sizeof(*pHostBuffer) * 1024 * 1024;

    // 以下为在主机端分配输入缓存
    pHostBuffer = malloc(contentLength);

    // 然后对此工作缓存进行初始化
    for(int i = 0; i < 1024 * 1024; i++)
        pHostBuffer[i] = i + 1;

    // 这里预分配的缓存大小为4MB，第一个参数是读写的
    memObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, contentLength, pHostBuffer, &ret);
    if(memObj == NULL)
    {
        puts("Memory object1 failed to create!");
        goto FINISH;
    }

    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memObj);
    if(ret != CL_SUCCESS)
    {
        puts("Set arguments error!");
        goto FINISH;
    }

    // 做存储器映射
    int *pDeviceBuffer = clEnqueueMapBuffer(command_queue, memObj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, contentLength, 0, NULL, NULL, &ret);
    if(pDeviceBuffer == NULL)
    {
        puts("Memory map failed!");
        goto FINISH;
    }

    if(pDeviceBuffer != pHostBuffer)
    {
        // 若从GPU端映射得到的存储器地址与原先主机端的不同，则将数据从主机端传递到GPU端
        ret = clEnqueueWriteBuffer(command_queue, memObj, CL_TRUE, 0, contentLength, pHostBuffer, 0, NULL, NULL);
        if(ret != CL_SUCCESS)
        {
            puts("Data transfer failed");
            goto FINISH;
        }

        /** 如果主机端与设备端地址不同，我们不妨测试一下设备端存储器的Cache情况 */

        // 先测试主机端的时间
        int sum = 0;
        // 先过一遍存储器
        for(int j = 0; j < 1024; j++)
            sum += pHostBuffer[j];
        time_t t1 = time(NULL);
        for(int i = 0; i < 1000000; i++)
        {
            for(int j = 0; j < 1024; j++)
                sum += pHostBuffer[j];
        }
        time_t t2 = time(NULL);
        printf("The host delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);

        // 测试设备端
        sum = 0;
        // 先过一遍存储器
        for(int j = 0; j < 1024; j++)
            sum += pDeviceBuffer[j];
        t1 = time(NULL);
        for(int i = 0; i < 1000000; i++)
        {
            for(int j = 0; j < 1024; j++)
                sum += pDeviceBuffer[j];
        }
        t2 = time(NULL);
        printf("The device delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);
    }
    else
    {
        // 若主机端与设备端存储器地址相同，我们仅仅做CPU端测试
        int sum = 0;
        // 先过一遍存储器
        for(int j = 0; j < 1024; j++)
            sum += pHostBuffer[j];
        time_t t1 = time(NULL);
        for(int i = 0; i < 1000000; i++)
        {
            for(int j = 0; j < 1024; j++)
                sum += pHostBuffer[j];
        }
        time_t t2 = time(NULL);
        printf("The host delta time is: %f. The value is: %d\n", difftime(t2, t1), sum);
    }

    // 这里指定将总共有1024 * 1024个work-item
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, (const size_t[]){1024 * 1024}, NULL, 0, NULL, NULL);

    // 做次同步，这里偷懒，不用wait event机制了
    clFinish(command_queue);

    // 做校验
    for(int i = 0; i < 1024 * 1024; i++)
    {
        if(pDeviceBuffer[i] != (i + 1) * 2)
        {
            puts("Result error!");
            break;
        }
    }

    puts("Compute finished!");

FINISH:

    /* Finalization */
    if(pHostBuffer != NULL)
        free(pHostBuffer);

    if(kernelSource != NULL)
        free(kernelSource);

    if(memObj != NULL)
        clReleaseMemObject(memObj);

    if(kernel != NULL)
        clReleaseKernel(kernel);

    if(program != NULL)
        clReleaseProgram(program);

    if(command_queue != NULL)
        clReleaseCommandQueue(command_queue);

    if(context != NULL)
        clReleaseContext(context);

    return 0;
}
```

以下是OpenCL内核源代码：

```c
kernel void test(__global int *pInOut)
{
    int index = get_global_id(0);
    pInOut[index] += pInOut[index];
}
```

另外，主机端代码部分中，OpenCL源文件路径是写死的。各位朋友可以根据自己环境来重新指定路径。

当然，我们还可以修改主机端 clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, contentLength, pHostBuffer, &ret); 这段创建存储器对象的属性。比如，将 CL_MEM_USE_HOST_PTR 去掉。然后可以再试试效果。

倘若 clCreateBuffer 的 flags 参数用的是 CL_MEM_ALLOC_HOST_PTR，那么其 host_ptr 参数必须为空。在调用 clEnqueueMapBuffer 之后，可以根据其返回的缓存地址对存储区域做数据初始化。 CL_MEM_ALLOC_HOST_PTR 表示应用程序暗示OpenCL实现从主机端可访问的存储空间给设备端分配存储缓存。这个与 CL_MEM_USE_HOST_PTR 还是有所区别的。CL_MEM_USE_HOST_PTR 是完全从应用端当前的内存池分配存储空间；而 CL_MEM_ALLOC_HOST_PTR 对于CPU与GPU共享主存的环境下，可以在CPU端留下一个访问GPU端VRAM的入口点。我们通过以下程序来测试当前环境的OpenCL实现（以下代码在调用了 clEnqueueMapBuffer 函数之后做了缓存数据初始化的时间比较）：

```c
    long deltaTimes[10];
    for(int i = 0; i < 10; i++)
    {
        struct timeval tBegin, tEnd;
        gettimeofday(&tBegin, NULL);
        for(int i = 0; i < 1024 * 1024; i++)
            pDeviceBuffer[i] = i + 1;
        gettimeofday(&tEnd, NULL);
        deltaTimes[i] = 1000000 * (tEnd.tv_sec - tBegin.tv_sec ) + tEnd.tv_usec - tBegin.tv_usec;
    }
    long useTime = deltaTimes[0];
    for(int i = 1; i < 10; i++)
    {
        if(useTime > deltaTimes[i])
            useTime = deltaTimes[i];
    }
    printf("Device memory time spent: %ldus\n", useTime);

    int *pHostBuffer = malloc(contentLength);
    for(int i = 0; i < 10; i++)
    {
        struct timeval tBegin, tEnd;
        gettimeofday(&tBegin, NULL);
        for(int i = 0; i < 1024 * 1024; i++)
            pHostBuffer[i] = i + 1;
        gettimeofday(&tEnd, NULL);
        deltaTimes[i] = 1000000 * (tEnd.tv_sec - tBegin.tv_sec ) + tEnd.tv_usec - tBegin.tv_usec;
    }
    useTime = deltaTimes[0];
    for(int i = 1; i < 10; i++)
    {
        if(useTime > deltaTimes[i])
            useTime = deltaTimes[i];
    }
    printf("Host memory time spent: %ldus\n", useTime);
```

其中，对 gettimeofday 的调用需要包含头文件 <sys/time.h>。这个函数所返回的时间可以精确到 μs（微秒）。

在Intel Core i7 4650U, Intel Graphics 5000环境下，花费时间差不多都是2.6ms（毫秒）。因此，在内核真正执行的时候，为了清空这部分存储空间的Cache，驱动还是要做点工作的。当然，驱动也可为这块内存区域分配 Write-Combined 类型的存储器，这样主机端对这部分数据的访问不会被Cache，尽管速度会慢很多，但是通过 non-temporal Stream 方式读写还是会很不错。况且大部分OpenCL应用对同一块内存数据的读写都只有一次，这么做也不会造成Cache污染。

查看全文

http://www.dnsts.com.cn/news/241603.html