I wrote the program below because, unfortunately, the “Simple OpenCL Example” in the documentation that ships with the Nvidia GPU Computing SDK 3.0 does not work, or at least it does not work on my combination of OS, GPU and processor. I haven’t had time to test the code below on my ATI card yet, but so far I know that it works on 64-bit Windows 7 with an Nvidia 9600 GT and an Intel i7 processor.
Go to the CUDA 3.0 Downloads page for information on how to set up OpenCL when using an Nvidia card. You can get the libraries required for the source below from the “GPU Computing SDK code samples” link. Windows 7 users need to make a short leap of faith and download the Vista 32/64-bit developer drivers (the name of the downloaded file suggests that it also covers Windows 7). As far as I can tell, I am not using any of Nvidia’s proprietary helper files.
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <CL/cl.h>
// OpenCL C source for the VectorAdd kernel, stored one source line per array
// element. The array is handed to clCreateProgramWithSource, which takes an
// array of string pointers plus an element count.
const char *OpenCLSource[] = {
    /* kernel signature: c is written, a and b are read */
    "__kernel void VectorAdd(__global int* c, __global int* a, __global int* b)\n",
    "{\n",
    /* each work-item handles the element at its own global id */
    " // Index of the elements to add\n",
    " unsigned int n = get_global_id(0);\n",
    " // Sum the n’th element of vectors a and b and store in c\n",
    " c[n] = a[n] + b[n];\n",
    "}\n"
};
// Seed patterns for the two input vectors, 20 values each. Added element by
// element, the two patterns give the character codes of
// "Hello OpenCL World! ".
int InitialData1[20] = {
    37, 50, 54,  50,  56,  0, 79, 112, 101, 110,
    67, 76, 32,  43,  56, 100, 50,  25,  15,  17
};
int InitialData2[20] = {
    35, 51, 54, 58, 55, 32,  0,  0,  0,  0,
     0,  0,  0, 44, 55, 14, 58, 75, 18, 15
};
// Element count of each vector that gets added
#define SIZE 2048
// Main function
// *********************************************************************
// Fills two SIZE-element host vectors with the repeating InitialData1 /
// InitialData2 patterns, adds them element-wise with the VectorAdd kernel on
// the first GPU device of the first OpenCL platform, and prints the first 305
// sums as characters. Platform and device details are printed along the way.
//
// argc/argv are not used.
//
// Returns 0 on success and 1 if an allocation or any OpenCL call fails; a
// diagnostic is written to stderr in that case. All heap memory and OpenCL
// objects are released on every path.
int _tmain(int argc, _TCHAR* argv[])
{
    // Every resource is declared (and nulled) up front so that any failure
    // can jump to the single cleanup section at the bottom without skipping
    // an initialisation.
    int exit_code = 1;
    cl_int error = CL_SUCCESS;

    // Two integer source vectors and the result vector in host memory
    int HostVector1[SIZE], HostVector2[SIZE];
    int HostOutputVector[SIZE];

    // The seed patterns are shorter than SIZE, so they are repeated
    const size_t pattern_length = sizeof(InitialData1) / sizeof(InitialData1[0]);
    // The kernel source is an array of lines; derive the count instead of
    // hard-coding it so the two cannot drift apart
    const cl_uint source_line_count =
        (cl_uint)(sizeof(OpenCLSource) / sizeof(OpenCLSource[0]));
    // How many result elements are echoed at the end
    const int results_to_print = 305;

    cl_uint numPlatforms = 0;
    cl_platform_id* clSelectedPlatformID = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id* clDevices = NULL;
    size_t ParmDataBytes = 0;
    cl_device_id* GPUDevices = NULL;

    cl_context GPUContext = NULL;
    cl_command_queue GPUCommandQueue = NULL;
    cl_mem GPUVector1 = NULL;
    cl_mem GPUVector2 = NULL;
    cl_mem GPUOutputVector = NULL;
    cl_program OpenCLProgram = NULL;
    cl_kernel OpenCLVectorAdd = NULL;

    char platform_info[128];
    char device_info[128];
    cl_uint device_value = 0;
    cl_context_properties props[3];
    size_t WorkSize[1] = {SIZE}; // one dimensional range

    (void)argc;
    (void)argv;

    // Initialize the host vectors with the repeating pattern data
    for (size_t c = 0; c < SIZE; c++)
    {
        HostVector1[c] = InitialData1[c % pattern_length];
        HostVector2[c] = InitialData2[c % pattern_length];
    }

    // ---- Platform information ----
    // First call only asks how many platforms exist
    error = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (error != CL_SUCCESS || numPlatforms == 0)
    {
        fprintf(stderr, "No OpenCL platform available (error %d).\n", (int)error);
        goto cleanup;
    }
    // Allocate memory so we can get the whole list
    clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
    if (clSelectedPlatformID == NULL)
    {
        fprintf(stderr, "Out of memory while listing platforms.\n");
        goto cleanup;
    }
    error = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clGetPlatformIDs failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Print out some debug info
    printf("------------------------------------\n");
    printf("Available platforms: %u.\n", numPlatforms);
    for (cl_uint i = 0; i < numPlatforms; i++)
    {
        if (clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_NAME, sizeof(platform_info), platform_info, NULL) == CL_SUCCESS)
            printf("#%u CL_PLATFORM_NAME: %s\n", i + 1, platform_info);
        if (clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_PROFILE, sizeof(platform_info), platform_info, NULL) == CL_SUCCESS)
            printf("#%u CL_PLATFORM_PROFILE: %s\n", i + 1, platform_info);
        if (clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_VERSION, sizeof(platform_info), platform_info, NULL) == CL_SUCCESS)
            printf("#%u CL_PLATFORM_VERSION: %s\n", i + 1, platform_info);
        if (clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_VENDOR, sizeof(platform_info), platform_info, NULL) == CL_SUCCESS)
            printf("#%u CL_PLATFORM_VENDOR: %s\n", i + 1, platform_info);
        // The extension list is probably quite long; this is just to give
        // an idea of how to query it
        if (clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_EXTENSIONS, sizeof(platform_info), platform_info, NULL) == CL_SUCCESS)
            printf("#%u CL_PLATFORM_EXTENSIONS: %s ...etc\n", i + 1, platform_info);
    }
    printf("------------------------------------\n");

    // ---- Device information (GPU devices of the first platform) ----
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    if (error != CL_SUCCESS || ciDeviceCount == 0)
    {
        fprintf(stderr, "No GPU device found on the first platform (error %d).\n", (int)error);
        goto cleanup;
    }
    clDevices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount);
    if (clDevices == NULL)
    {
        fprintf(stderr, "Out of memory while listing devices.\n");
        goto cleanup;
    }
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, ciDeviceCount, clDevices, &ciDeviceCount);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clGetDeviceIDs failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Debug info
    printf("Available Devices: %u.\n", ciDeviceCount);
    for (cl_uint i = 0; i < ciDeviceCount; i++)
    {
        if (clGetDeviceInfo(clDevices[i], CL_DEVICE_NAME, sizeof(device_info), device_info, NULL) == CL_SUCCESS)
            printf("#%u CL_DEVICE_NAME: %s\n", i + 1, device_info);
        if (clGetDeviceInfo(clDevices[i], CL_DRIVER_VERSION, sizeof(device_info), device_info, NULL) == CL_SUCCESS)
            printf("#%u CL_DRIVER_VERSION: %s\n", i + 1, device_info);
        if (clGetDeviceInfo(clDevices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &device_value, NULL) == CL_SUCCESS)
            printf("#%u CL_DEVICE_MAX_CLOCK_FREQUENCY: %uMHz\n", i + 1, device_value);
        if (clGetDeviceInfo(clDevices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_value, NULL) == CL_SUCCESS)
            printf("#%u CL_DEVICE_MAX_COMPUTE_UNITS: %u\n", i + 1, device_value);
        // There are lots more... you should get the idea
        printf("etc...\n");
    }
    printf("------------------------------------\n");

    // ---- Context and command queue ----
    props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;      // indicates that next element is platform
    props[1] = (cl_context_properties)clSelectedPlatformID[0];  // platform is of type cl_platform_id
    props[2] = (cl_context_properties)0;                        // last element must be 0

    // Create a context to run OpenCL on our GPU
    GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &error);
    if (error != CL_SUCCESS || GPUContext == NULL)
    {
        fprintf(stderr, "clCreateContextFromType failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Get the list of GPU devices associated with this context. The devices
    // enumerated above could be used instead; the context's own list is
    // used for the queue and the build log.
    error = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
    if (error != CL_SUCCESS || ParmDataBytes < sizeof(cl_device_id))
    {
        fprintf(stderr, "Could not size the context device list (error %d).\n", (int)error);
        goto cleanup;
    }
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    if (GPUDevices == NULL)
    {
        fprintf(stderr, "Out of memory while listing context devices.\n");
        goto cleanup;
    }
    error = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clGetContextInfo failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Create a command-queue on the first GPU device
    GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, &error);
    if (error != CL_SUCCESS || GPUCommandQueue == NULL)
    {
        fprintf(stderr, "clCreateCommandQueue failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // ---- Buffers ----
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, &error);
    if (error != CL_SUCCESS || GPUVector1 == NULL)
    {
        fprintf(stderr, "clCreateBuffer (input 1) failed (error %d).\n", (int)error);
        goto cleanup;
    }
    GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, &error);
    if (error != CL_SUCCESS || GPUVector2 == NULL)
    {
        fprintf(stderr, "clCreateBuffer (input 2) failed (error %d).\n", (int)error);
        goto cleanup;
    }
    // Allocate output memory on GPU
    GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, &error);
    if (error != CL_SUCCESS || GPUOutputVector == NULL)
    {
        fprintf(stderr, "clCreateBuffer (output) failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // ---- Program and kernel ----
    // Create OpenCL program with source code
    OpenCLProgram = clCreateProgramWithSource(GPUContext, source_line_count, OpenCLSource, NULL, &error);
    if (error != CL_SUCCESS || OpenCLProgram == NULL)
    {
        fprintf(stderr, "clCreateProgramWithSource failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Build the program (OpenCL JIT compilation)
    error = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
    if (error != CL_SUCCESS)
    {
        // The build itself failed: fetch and print the compiler log so the
        // problem in the kernel source can be seen, then quit.
        size_t log_size = 0;
        fprintf(stderr, "clBuildProgram failed (error %d).\n", (int)error);
        if (clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) == CL_SUCCESS)
        {
            char* build_log = (char*)malloc(log_size + 1);
            if (build_log != NULL)
            {
                if (clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) == CL_SUCCESS)
                {
                    build_log[log_size] = '\0';
                    printf("%s\n", build_log);
                }
                free(build_log);
            }
        }
        goto cleanup;
    }

    // Create a handle to the compiled OpenCL function (Kernel)
    OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", &error);
    if (error != CL_SUCCESS || OpenCLVectorAdd == NULL)
    {
        fprintf(stderr, "clCreateKernel failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Associate the GPU memory with the kernel arguments (c, a, b)
    error = clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
    if (error == CL_SUCCESS)
        error = clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    if (error == CL_SUCCESS)
        error = clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clSetKernelArg failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // ---- Run ----
    // Launch the kernel on the GPU, one work-item per vector element
    error = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clEnqueueNDRangeKernel failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Copy the output in GPU memory back to CPU memory (blocking read)
    error = clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);
    if (error != CL_SUCCESS)
    {
        fprintf(stderr, "clEnqueueReadBuffer failed (error %d).\n", (int)error);
        goto cleanup;
    }

    // Print out the results
    printf("Results of OpenCL Kernel:\n");
    for (int c = 0; c < results_to_print; c++)
        printf("%c", (char)HostOutputVector[c]);

    exit_code = 0;

cleanup:
    // Release only what was actually created; handles that were never
    // obtained are still NULL.
    if (OpenCLVectorAdd != NULL)
        clReleaseKernel(OpenCLVectorAdd);
    if (OpenCLProgram != NULL)
        clReleaseProgram(OpenCLProgram);
    if (GPUVector1 != NULL)
        clReleaseMemObject(GPUVector1);
    if (GPUVector2 != NULL)
        clReleaseMemObject(GPUVector2);
    if (GPUOutputVector != NULL)
        clReleaseMemObject(GPUOutputVector);
    if (GPUCommandQueue != NULL)
        clReleaseCommandQueue(GPUCommandQueue);
    if (GPUContext != NULL)
        clReleaseContext(GPUContext);
    free(GPUDevices);
    free(clDevices);
    free(clSelectedPlatformID);
    return exit_code;
}