Week 5
OpenCL
This week we are
going to take a look at OpenCL (Open Computing Language)
http://en.wikipedia.org/wiki/Opencl
Machines are connected into
clusters through protocols such as MPI allowing large tasks to
be spread across multiple machines; CPUs now have multiple cores
able to run several threads simultaneously, and GPUs can now run
hundreds and thousands of lightweight threads simultaneously.
Right now
each of these levels is programmed independently and there are
various projects looking at trying to integrate them together
where the compiler and the code can adapt to the execution
environment.
OpenCL is
one example that has been gaining support from various vendors
to integrate work on the CPU and the GPU so it's easier for
software to take advantage of any available GPUs. It may not be
the eventual "winner" but something similar will be a more
common way of programming GPUs for general work in the future.
There is a
short overview from AMD here:
http://www.amd.com/us/products/technologies/stream-technology/opencl/pages/opencl-intro.aspx
and one from
nvidia giving the relationship to CUDA
http://developer.download.nvidia.com/compute/cuda/3_2_prod/toolkit/docs/OpenCL_Programming_Guide.pdf
and if you
prefer watching a video instead:
This is a
very general (and somewhat long) introduction:
http://www.macresearch.org/files/opencl/Episode_1.mov
and here is
another series of short videos from AMD which we are going to go
through in class today
http://developer.amd.com/documentation/videos/OpenCLTechnicalOverviewVideoSeries/Pages/default.aspx#/Dev_OpenCL_1
The next step
is getting the CPUs and the GPUs more physically integrated — e.g.
Intel's Sandy Bridge, AMD's HSA (Heterogeneous Systems
Architecture) which will allow both kinds of processors to access
the same memory eliminating the major slowdown in moving data to
and from the GPU
Andy Galaxy
example code based off of the nvidia 'OpenCL Simple OpenGL
Interop' code available at
http://developer.nvidia.com/opencl-sdk-code-samples
oclAndyGalaxy.cl
oclAndyGalaxy.cpp
galaxy.txt
Makefile
Here is a
Hello World example from apple
//
// File: hello.c
//
// Abstract: A simple "Hello World" compute example
showing basic usage of OpenCL which
//
calculates the mathematical square (X[i] = pow(X[i],2)) for a
buffer of
//
floating point values.
//
// Version: <1.0>
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
////////////////////////////////////////////////////////////////////////////////
// Use a static data size for simplicity
//
#define DATA_SIZE (1024)

////////////////////////////////////////////////////////////////////////////////
// Simple compute kernel which computes the square of an input array.
//
// The OpenCL C source is carried in a single host-side string literal; each
// kernel line ends in an escaped "\n" so the OpenCL compiler sees real line
// breaks, and the physical C lines are joined with backslash continuations.
// (The extracted copy had the literal split across lines without the closing
// quotes/continuations, which is a compile error — reconstructed here.)
//
const char *KernelSource = "\n" \
"__kernel void square(                    \n" \
"   __global float* input,                \n" \
"   __global float* output,               \n" \
"   const unsigned int count)             \n" \
"{                                        \n" \
"   int i = get_global_id(0);             \n" \
"   if(i < count)                         \n" \
"       output[i] = input[i] * input[i];  \n" \
"}                                        \n" \
"\n";
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
int
err;
// error code returned from api calls
float
data[DATA_SIZE];
// original data set given to device
float
results[DATA_SIZE];
// results returned from device
unsigned int
correct;
// number of correct results returned
size_t
global;
// global domain size for our calculation
size_t
local;
// local domain size for our calculation
cl_device_id
device_id;
// compute device id
cl_context
context;
// compute context
cl_command_queue
commands; //
compute command queue
cl_program
program;
// compute program
cl_kernel
kernel;
// compute kernel
cl_mem
input;
// device memory used for the input array
cl_mem
output;
// device memory used for the output array
// Fill our data set with random float values
//
int i = 0;
unsigned int count = DATA_SIZE;
for(i = 0; i < count; i++)
data[i] = rand() /
(float)RAND_MAX;
// Connect to a compute device
// cl_int clGetDeviceIDs(
cl_platform_id platform,
//
cl_device_type device_type,
//
cl_uint
num_entries,
//
cl_device_id
* devices,
//
cl_uint *
num_devices)
//
// cl_device_type
Description
// CL_DEVICE_TYPE_CPU
- An OpenCL device
that is the host processor. The host processor runs the
//
OpenCL implementations and is a
single or multi-core CPU.
// CL_DEVICE_TYPE_GPU
- An OpenCL device that is a GPU.
By this we mean that the device can also
//
be used to accelerate a 3D API such
as OpenGL or DirectX.
//
CL_DEVICE_TYPE_ACCELERATOR - Dedicated
OpenCL accelerators (for example the IBM CELL Blade). These
devices
//
communicate with the host processor
using a peripheral interconnect such as PCIe.
// CL_DEVICE_TYPE_DEFAULT
- The default OpenCL device in the
system.
// CL_DEVICE_TYPE_ALL
- All OpenCL devices
available in the system.
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ?
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed
to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
// cl_context
clCreateContext( const cl_context_properties
* properties,
//
cl_uint
num_devices,
//
const
cl_device_id *
devices,
//
(void
CL_CALLBACK * pfn_notify) (
//
const char
* errinfo,
//
const void
* private_info, size_t cb,
//
void
*
user_data
//
),
//
void
*
user_data,
//
cl_int
*
errcode_ret)
context = clCreateContext(0, 1, &device_id,
NULL, NULL, &err);
if (!context)
{
printf("Error: Failed
to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
// cl_command_queue clCreateCommandQueue(
cl_context
context,
//
cl_device_id
device,
//
cl_command_queue_properties properties,
//
cl_int *
errcode_ret)
commands = clCreateCommandQueue(context,
device_id, 0, &err);
if (!commands)
{
printf("Error: Failed
to create a command commands!\n");
return EXIT_FAILURE;
}
// Create the compute program from the source
buffer
//
program = clCreateProgramWithSource(context, 1,
(const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed
to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
// cl_int clBuildProgram (
cl_program
program,
//
cl_uint
num_devices,
//
const cl_device_id
* device_list,
//
const char *
options,
//
void (CL_CALLBACK
*pfn_notify)(cl_program program, void *user_data),
//
void *
user_data)
err = clBuildProgram(program, 0, NULL, NULL,
NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed
to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we
wish to run
// cl_kernel clCreateKernel (
cl_program program,
//
const char
* kernel_name,
//
cl_int
* errcode_ret)
kernel = clCreateKernel(program, "square",
&err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed
to create compute kernel!\n");
exit(1);
}
// Create the input and output arrays in device
memory for our calculation
//
// cl_mem clCreateBuffer
( cl_context context,
//
cl_mem_flags flags,
//
size_t
size,
//
void *
host_ptr,
//
cl_int *
errcode_ret)
//
// CL_MEM_READ_WRITE - CL_MEM_WRITE_ONLY -
CL_MEM_READ_ONLY -
// CL_MEM_USE_HOST_PTR - CL_MEM_ALLOC_HOST_PTR
- CL_MEM_COPY_HOST_PTR
input = clCreateBuffer(context,
CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context,
CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed
to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in
device memory
// cl_int clEnqueueWriteBuffer (
cl_command_queue command_queue,
//
cl_mem
buffer,
//
cl_bool
blocking_write,
//
size_t
offset,
//
size_t
cb,
//
const
void * ptr,
//
cl_uint
num_events_in_wait_list,
//
const
cl_event * event_wait_list,
//
cl_event
* event)
err = clEnqueueWriteBuffer(commands, input,
CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed
to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
// cl_int clSetKernelArg
( cl_kernel
kernel,
//
cl_uint
arg_index,
//
size_t
arg_size,
//
const void
* arg_value)
err = 0;
err = clSetKernelArg(kernel, 0,
sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1,
sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2,
sizeof(unsigned int), &count);
if (err != CL_SUCCESS)
{
printf("Error: Failed
to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for
executing the kernel on the device
// cl_int clGetKernelWorkGroupInfo
( cl_kernel
kernel,
//
cl_device_id
device,
//
cl_kernel_work_group_info param_name,
//
size_t
param_value_size,
//
void *
param_value,
//
size_t *
param_value_size_ret)
err = clGetKernelWorkGroupInfo(kernel,
device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local,
NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed
to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of
our 1d input data set
// using the maximum number of work group items
for this device
// cl_int clEnqueueNDRangeKernel
( cl_command_queue
command_queue,
//
cl_kernel
kernel,
//
cl_uint
work_dim,
//
const size_t *
global_work_offset,
//
const size_t *
global_work_size,
//
const size_t *
local_work_size,
//
cl_uint
num_events_in_wait_list,
//
const cl_event *
event_wait_list,
//
cl_event *
event)
global = count;
err = clEnqueueNDRangeKernel(commands, kernel,
1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed
to execute kernel!\n");
return EXIT_FAILURE;
}
// Wait for the command commands to get
serviced before reading back results
// cl_int clFinish (
cl_command_queue command_queue)
clFinish(commands);
// Read back the results from the device to
verify the output
// cl_int clEnqueueReadBuffer
( cl_command_queue command_queue,
//
cl_mem
buffer,
//
cl_bool
blocking_read,
//
size_t
offset,
//
size_t
cb,
//
void
*
ptr,
//
cl_uint
num_events_in_wait_list,
//
const
cl_event * event_wait_list,
//
cl_event
* event)
err = clEnqueueReadBuffer( commands, output,
CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed
to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i = 0; i < count; i++)
{
if(results[i] ==
data[i] * data[i])
correct++;
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values!\n",
correct, count);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
http://researchdaily.blogspot.com/2009/12/vector-addition-on-cuda-and-opencl.html
Coming Next
Time
CUDA
last revision 2/21/12
- added in Xcode version that should run on AMD or NVIDIA GPUs on
the mac