OpenCL Simple Working Example

2014/12/21 19:51
I made the program below because, unfortunately, the “Simple OpenCL Example” that comes with the Nvidia GPU Computing SDK 3.0 documentation does not work — or at least it does not work on my combination of OS, GPU and processor. I haven’t had time to test the code below on my ATI card yet, but so far I know that it works on 64-bit Windows 7 with an Nvidia 9600 GT and an Intel i7 processor.
Go to the CUDA 3.0 Downloads page for information on how to set up OpenCL when using an Nvidia card. You can get the libraries required for the source below from the “GPU Computing SDK code samples” link. Windows 7 users need to make a short leap of faith and download the Vista 32/64-bit developer drivers (which, as the name of the downloaded file seems to suggest, also cover Windows 7). I am not, I think, using any of Nvidia’s proprietary helper files.
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <CL/cl.h>
 
 
// OpenCL C source of the VectorAdd kernel, stored as one string per source
// line. The array is handed to clCreateProgramWithSource in _tmain; the count
// argument passed there must equal the number of strings in this array.
// The text is kept pure ASCII so the bytes given to the OpenCL compiler do not
// depend on the encoding this host file happens to be saved in.
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a, __global int* b)\n",
"{\n",
"       // Index of the elements to add\n",
"       unsigned int n = get_global_id(0);\n",
"       // Sum the n'th element of vectors a and b and store in c\n",
"       c[n] = a[n] + b[n];\n",
"}\n"
};
 
// Seed data for the two input vectors (20 values each). Adding the arrays
// element by element yields the character codes of "Hello OpenCL World! ".
int InitialData1[20] = {
    37, 50, 54, 50, 56,
    0, 79, 112, 101, 110,
    67, 76, 32, 43, 56,
    100, 50, 25, 15, 17
};
int InitialData2[20] = {
    35, 51, 54, 58, 55,
    32, 0, 0, 0, 0,
    0, 0, 0, 44, 55,
    14, 58, 75, 18, 15
};
 
// Number of elements in each of the vectors to be added. _tmain sizes the
// host arrays and the device buffers with it and also uses it as the
// one-dimensional global work size of the kernel launch.
#define SIZE 2048
 
// Main function
// *********************************************************************
int _tmain(int argc, _TCHAR* argv[])
{  
    // Two integer source vectors in Host memory
    int HostVector1[SIZE], HostVector2[SIZE];
    // Initialize them some some interesting repeating data
    for(int c = 0; c < SIZE; c++)
    {
        HostVector11 = InitialData11;
        HostVector21 = InitialData21;
    }
 
    cl_int error = 0;
 
    //Platform Information
    cl_uint numPlatforms;
    cl_platform_id* clSelectedPlatformID = NULL;   
    //get the number of available platforms
    clGetPlatformIDs(0, NULL, &numPlatforms);  
    //alloc memory so we can get the whole list
    clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms);
    //get the list of available platforms
    error = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL);
 
    if(error != CL_SUCCESS) //we aren't going to bother with any error checking just now, just quit out
        return 0;
 
    //print out some debug info
    printf("------------------------------------\n");
    printf("Available platforms: %d.\n", numPlatforms);
    char platform_info[128];
    for(unsigned int i=0;i<numPlatforms; i++)
    {
        if(clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_NAME, sizeof(char)*128, platform_info, NULL)==CL_SUCCESS)
            printf("#%d CL_PLATFORM_NAME: %s\n", i+1, platform_info);
 
        if(clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_PROFILE, sizeof(char)*128, platform_info, NULL)==CL_SUCCESS)
            printf("#%d CL_PLATFORM_PROFILE: %s\n", i+1, platform_info);       
 
        if(clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_VERSION, sizeof(char)*128, platform_info, NULL)==CL_SUCCESS)
            printf("#%d CL_PLATFORM_VERSION: %s\n", i+1, platform_info);       
 
        if(clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_VENDOR, sizeof(char)*128, platform_info, NULL)==CL_SUCCESS)
            printf("#%d CL_PLATFORM_VENDOR: %s\n", i+1, platform_info);    
 
        //the last one is probably quite long so this is just to giev an idea of how to do it
        if(clGetPlatformInfo(clSelectedPlatformID[i], CL_PLATFORM_EXTENSIONS, sizeof(char)*128, platform_info, NULL)==CL_SUCCESS)
            printf("#%d CL_PLATFORM_EXTENSIONS: %s ...etc\n", i+1, platform_info);             
    }
    printf("------------------------------------\n");
 
    //Device info
    cl_uint ciDeviceCount;
    cl_device_id* clDevices =  NULL;
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
 
    if(error != CL_SUCCESS)
        return 0;
 
    clDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * ciDeviceCount);
    error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, ciDeviceCount, clDevices, &ciDeviceCount);
 
    if(error != CL_SUCCESS)
        return 0;
 
    //debug info
    printf("Available Devices: %d.\n",ciDeviceCount);
    char device_info[128];
    cl_uint device_value = 0;
    for(unsigned int i=0;i<ciDeviceCount; i++)
    {
        if(clGetDeviceInfo(clDevices[i], CL_DEVICE_NAME, sizeof(char)*128, device_info, NULL) == CL_SUCCESS)
            printf("#%d CL_DEVICE_NAME: %s\n", i+1, device_info);
 
        if(clGetDeviceInfo(clDevices[i], CL_DRIVER_VERSION, sizeof(char)*128, device_info, NULL) == CL_SUCCESS)
            printf("#%d CL_DRIVER_VERSION: %s\n", i+1, device_info);
 
        if(clGetDeviceInfo(clDevices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &device_value, NULL) == CL_SUCCESS)
            printf("#%d CL_DEVICE_MAX_CLOCK_FREQUENCY: %dMHz\n", i+1, device_value);
 
        if(clGetDeviceInfo(clDevices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_value, NULL) == CL_SUCCESS)
            printf("#%d CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", i+1, device_value);
 
        //there are lots more...you should get the idea
        printf("etc...\n", i+1, device_value); 
         
    }
    printf("------------------------------------\n");
 
    //this bit just below came from some forum, i forgot which (sorry)
    cl_context_properties props[3];
    props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;      // indicates that next element is platform
    props[1] = (cl_context_properties)clSelectedPlatformID[0];  // platform is of type cl_platform_id
    props[2] = (cl_context_properties)0;                        // last element must be 0
 
    //the below is mostly from the NVIDIA GPU COMPUTING SDK : NVIDIA_OpenCL_GettingStartedWindows.pdf
    // Create a context to run OpenCL on our GPU
    cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, &error);
     
    // Get the list of GPU devices associated with this context
    // we can use this, which might be better, or we can use one of the devices we
    //found earler
    size_t ParmDataBytes;
    clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
    cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
 
    // Create a command-queue on the first GPU device
    cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);
 
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL);
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL);
 
    // Allocate output memory on GPU
    cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL);
 
    // Create OpenCL program with source code
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, &error);
 
    // Build the program (OpenCL JIT compilation)
    error = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
    char *build_log;
    size_t ret_val_size;
    if(clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) != CL_SUCCESS)
    {
        //if there was an error with the above source
        //report the error and exit
        build_log = new char[ret_val_size+1];
        error = clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
        build_log[ret_val_size] = '\0';
        printf("%s\n",build_log);
        return 0;
    }
 
    // Create a handle to the compiled OpenCL function (Kernel)
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
 
    // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&GPUOutputVector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
 
    // Launch the Kernel on the GPU
    size_t WorkSize[1] = {SIZE}; // one dimensional Range
    cl_int temp = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL);;
 
    // Copy the output in GPU memory back to CPU memory
    int HostOutputVector[SIZE];
    clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);
 
    // Cleanup
    free(clSelectedPlatformID);
    free(clDevices);
    free(GPUDevices);
    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(GPUCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
 
    // Print out the results
    printf("Results of OpenCL Kernel:\n");
    for(int c = 0; c <305; c++)
        printf("%c",(char)HostOutputVector[ c ] );
 
    return 0;
}