add opencl dir

2024-01-05 20:40:37 +01:00
parent fd121e7979
commit f5b00f662c
4 changed files with 297 additions and 0 deletions
@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "ftest/ftest.h"
+
+
+#define VECTOR_SIZE 1024
+
+/* OpenCL C source of the SAXPY kernel, kept as a host-side string. Each
+ * work-item stores alpha * A[index] + B[index] into C[index], where index is
+ * its global id in dimension 0. */
+const char *saxpy_kernel = 
+"__kernel                                 \n"
+"void saxpy_kernel(float alpha,           \n"
+"                  __global float *A,     \n"
+"                  __global float *B,     \n"
+"                  __global float *C)     \n"
+"{                                        \n"
+" // Get the index of the work-item       \n"
+" int index = get_global_id(0);           \n"
+" C[index] = alpha * A[index] + B[index]; \n"
+"}                                        \n";
+
+/* Prints a diagnostic on stderr for a failed OpenCL call.
+ * Returns 1 when status is CL_SUCCESS and 0 otherwise. */
+static int saxpy_cl_ok(cl_int status, const char *what)
+{
+  if (status == CL_SUCCESS) {
+    return 1;
+  }
+  fprintf(stderr, "OpenCL error %d in %s\n", (int)status, what);
+  return 0;
+}
+
+/*
+ * Runs saxpy_kernel (C = alpha * A + B) over VECTOR_SIZE floats on the first
+ * GPU device of the first OpenCL platform and verifies the result on the host.
+ *
+ * Every malloc and OpenCL result is checked. The first failure jumps to a
+ * single cleanup path that releases only what was actually acquired, and the
+ * test is then reported as failed instead of dereferencing invalid handles.
+ *
+ * NOTE(review): EXPECT_EQ is assumed to be provided by ftest/ftest.h - confirm.
+ */
+TEST(openCL_one){
+  float alpha = 3.0f;
+  int passed = 0;
+  int mismatches = 0;
+  cl_int clStatus = CL_SUCCESS;
+
+  /* All resources start out NULL so the cleanup path can tell what exists. */
+  float *A = NULL;
+  float *B = NULL;
+  float *C = NULL;
+  cl_platform_id *platforms = NULL;
+  cl_device_id *device_list = NULL;
+  cl_uint num_platforms = 0;
+  cl_uint num_devices = 0;
+  cl_context context = NULL;
+  cl_command_queue command_queue = NULL;
+  cl_mem A_clmem = NULL;
+  cl_mem B_clmem = NULL;
+  cl_mem C_clmem = NULL;
+  cl_program program = NULL;
+  cl_kernel kernel = NULL;
+  const size_t buffer_bytes = VECTOR_SIZE * sizeof(float);
+  size_t global_size = VECTOR_SIZE; /* one work-item per vector element */
+  size_t local_size = 64;           /* work-group size: 64 work-items per group */
+
+  /* Allocate and fill the host vectors A, B, C. */
+  A = malloc(buffer_bytes);
+  B = malloc(buffer_bytes);
+  C = malloc(buffer_bytes);
+  if (A == NULL || B == NULL || C == NULL) {
+    fprintf(stderr, "Out of memory allocating host vectors\n");
+    goto cleanup;
+  }
+  for (int k = 0; k < VECTOR_SIZE; ++k) {
+    A[k] = (float)k;
+    B[k] = (float)(VECTOR_SIZE - k);
+    C[k] = 0.0f;
+  }
+
+  /* Query the number of platforms first, then fetch their ids. */
+  clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
+  if (!saxpy_cl_ok(clStatus, "clGetPlatformIDs (count)")) {
+    goto cleanup;
+  }
+  if (num_platforms == 0) {
+    fprintf(stderr, "No OpenCL platform available\n");
+    goto cleanup;
+  }
+  platforms = malloc(sizeof *platforms * num_platforms);
+  if (platforms == NULL) {
+    fprintf(stderr, "Out of memory allocating platform list\n");
+    goto cleanup;
+  }
+  clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
+  if (!saxpy_cl_ok(clStatus, "clGetPlatformIDs (list)")) {
+    goto cleanup;
+  }
+  for (cl_uint p = 0; p < num_platforms; ++p) {
+    /* Platform ids are opaque pointers, so print them with %p. */
+    printf("--- %u/%u : %p\n", (unsigned)p, (unsigned)num_platforms, (void *)platforms[p]);
+  }
+
+  /* Same two-step query for the GPU devices of the first platform. */
+  clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
+  if (!saxpy_cl_ok(clStatus, "clGetDeviceIDs (count)")) {
+    goto cleanup;
+  }
+  if (num_devices == 0) {
+    fprintf(stderr, "No OpenCL GPU device available\n");
+    goto cleanup;
+  }
+  device_list = malloc(sizeof *device_list * num_devices);
+  if (device_list == NULL) {
+    fprintf(stderr, "Out of memory allocating device list\n");
+    goto cleanup;
+  }
+  clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
+  if (!saxpy_cl_ok(clStatus, "clGetDeviceIDs (list)")) {
+    goto cleanup;
+  }
+
+  /* One context spanning every GPU device found on the platform. */
+  context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateContext")) {
+    goto cleanup;
+  }
+  for (cl_uint d = 0; d < num_devices; ++d) {
+    printf("--- %u/%u : %p\n", (unsigned)d, (unsigned)num_devices, (void *)device_list[d]);
+  }
+
+  /* All work is submitted to the first device only. */
+  command_queue = clCreateCommandQueue(context, device_list[0], 0, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateCommandQueue")) {
+    goto cleanup;
+  }
+
+  /* Device buffers for the three vectors. */
+  A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_bytes, NULL, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateBuffer (A)")) {
+    goto cleanup;
+  }
+  B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_bytes, NULL, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateBuffer (B)")) {
+    goto cleanup;
+  }
+  C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_bytes, NULL, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateBuffer (C)")) {
+    goto cleanup;
+  }
+
+  /* Blocking copies of A and B to the device. */
+  clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, buffer_bytes, A, 0, NULL, NULL);
+  if (!saxpy_cl_ok(clStatus, "clEnqueueWriteBuffer (A)")) {
+    goto cleanup;
+  }
+  clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, buffer_bytes, B, 0, NULL, NULL);
+  if (!saxpy_cl_ok(clStatus, "clEnqueueWriteBuffer (B)")) {
+    goto cleanup;
+  }
+
+  /* Create the program from the NUL-terminated source and build it for the first device. */
+  program = clCreateProgramWithSource(context, 1, (const char **)&saxpy_kernel, NULL, &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateProgramWithSource")) {
+    goto cleanup;
+  }
+  clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
+  if (!saxpy_cl_ok(clStatus, "clBuildProgram")) {
+    goto cleanup;
+  }
+
+  kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
+  if (!saxpy_cl_ok(clStatus, "clCreateKernel")) {
+    goto cleanup;
+  }
+
+  /* Kernel arguments, in the order declared by saxpy_kernel. */
+  clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
+  if (!saxpy_cl_ok(clStatus, "clSetKernelArg (alpha)")) {
+    goto cleanup;
+  }
+  clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem);
+  if (!saxpy_cl_ok(clStatus, "clSetKernelArg (A)")) {
+    goto cleanup;
+  }
+  clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem);
+  if (!saxpy_cl_ok(clStatus, "clSetKernelArg (B)")) {
+    goto cleanup;
+  }
+  clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem);
+  if (!saxpy_cl_ok(clStatus, "clSetKernelArg (C)")) {
+    goto cleanup;
+  }
+
+  /* Launch over the whole vector, then read C back with a blocking read. */
+  clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
+  if (!saxpy_cl_ok(clStatus, "clEnqueueNDRangeKernel")) {
+    goto cleanup;
+  }
+  clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, buffer_bytes, C, 0, NULL, NULL);
+  if (!saxpy_cl_ok(clStatus, "clEnqueueReadBuffer (C)")) {
+    goto cleanup;
+  }
+
+  /* Wait for every queued command to complete. */
+  clStatus = clFlush(command_queue);
+  if (!saxpy_cl_ok(clStatus, "clFlush")) {
+    goto cleanup;
+  }
+  clStatus = clFinish(command_queue);
+  if (!saxpy_cl_ok(clStatus, "clFinish")) {
+    goto cleanup;
+  }
+
+  /* Verify on the host. The inputs are small integers, so alpha*A+B is
+   * exactly representable in float and an exact comparison is valid. */
+  for (int k = 0; k < VECTOR_SIZE; ++k) {
+    if (C[k] != alpha * A[k] + B[k]) {
+      if (mismatches == 0) {
+        fprintf(stderr, "saxpy mismatch at %d: %f * %f + %f != %f\n",
+                k, alpha, A[k], B[k], C[k]);
+      }
+      ++mismatches;
+    }
+  }
+  passed = (mismatches == 0);
+
+cleanup:
+  /* Release OpenCL objects, skipping any that were never created. */
+  if (kernel != NULL) {
+    (void)clReleaseKernel(kernel);
+  }
+  if (program != NULL) {
+    (void)clReleaseProgram(program);
+  }
+  if (A_clmem != NULL) {
+    (void)clReleaseMemObject(A_clmem);
+  }
+  if (B_clmem != NULL) {
+    (void)clReleaseMemObject(B_clmem);
+  }
+  if (C_clmem != NULL) {
+    (void)clReleaseMemObject(C_clmem);
+  }
+  if (command_queue != NULL) {
+    (void)clReleaseCommandQueue(command_queue);
+  }
+  if (context != NULL) {
+    (void)clReleaseContext(context);
+  }
+
+  free(A);
+  free(B);
+  free(C);
+  free(platforms);
+  free(device_list);
+
+  /* Fails the test on any setup error or result mismatch. */
+  EXPECT_EQ(1, passed);
+}
+/* Entry point: hands the command line to run_all_tests_args() and exits with
+ * status 0 regardless of what that call does. */
+int main(int argc, char **argv)
+{
+  run_all_tests_args(argc, argv);
+  return EXIT_SUCCESS;
+}
+
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+if [ "$#" -le 0 ] ; then
+  echo "Usage: $0 is_good.c" >&2
+  echo "for example to compile: is_good.c" >&2
+  exit 1
+fi
+
+if [ "$#" -le 1 ] ; then
+  echo "Usage: $0 $1" >&2
+  echo "we can add more option for example '-D DEBUG=1' to have debug print of PRINT_DEBUG_ (tools_t macro), notice that PRINT_DEBUG is provide by ytest and can be activate with --debug option on runtime." 
+  echo "The other compile option is '-g' to have gbd, and so on..."
+  echo "for example: $0 $1 \"-D DEBUG=1 -g\""
+fi
+
+YTESTDIR=$PWD/../ytest_t
+
+gcc -o launch_is_good_c $1 -L$YTESTDIR $2 -lytest -lOpenCL -I$YTESTDIR/include_ytest/include  
+echo "gcc -o launch_is_good_c $1 -L$YTESTDIR $2 -lytest -I$YTESTDIR/include_ytest/include"  
+
+export LD_LIBRARY_PATH=$YTESTDIR:LD_LIBRARY_PATH
+
+
@@ -0,0 +1,128 @@
+#include "ftest/ftest.h"
+#include "fmock/fmock.h"
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// #define CL_USE_DEPRECATED_OPENCL_1_2_APIS //
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+ 
+#define MAX_SOURCE_SIZE (0x100000)
+ 
+/* Prints a diagnostic on stderr for a failed OpenCL call.
+ * Returns 1 when status is CL_SUCCESS and 0 otherwise. */
+static int vadd_cl_ok(cl_int status, const char *what)
+{
+    if (status == CL_SUCCESS) {
+        return 1;
+    }
+    fprintf(stderr, "OpenCL error %d in %s\n", (int)status, what);
+    return 0;
+}
+
+/*
+ * Loads vector_add_kernel.cl from the current directory, runs its vector_add
+ * kernel (C = alpha * A + B on int buffers) on the default device of the first
+ * platform, and checks every element of the result.
+ *
+ * Every malloc, file and OpenCL result is checked. The first failure jumps to
+ * a single cleanup path that frees only what was acquired (including the
+ * kernel source buffer) and the test is reported as failed, rather than
+ * terminating the whole test runner with exit().
+ */
+TEST(openCL) {
+    const int LIST_SIZE = 1024;
+    float alpha = 2.0;
+    int completed = 0;
+    cl_int ret = CL_SUCCESS;
+
+    /* All resources start out NULL so the cleanup path can tell what exists. */
+    int *A = NULL;
+    int *B = NULL;
+    int *C = NULL;
+    FILE *fp = NULL;
+    char *source_str = NULL;
+    size_t source_size = 0;
+    cl_platform_id platform_id = NULL;
+    cl_device_id device_id = NULL;
+    cl_uint ret_num_devices = 0;
+    cl_uint ret_num_platforms = 0;
+    cl_context context = NULL;
+    cl_command_queue command_queue = NULL;
+    cl_mem a_mem_obj = NULL;
+    cl_mem b_mem_obj = NULL;
+    cl_mem c_mem_obj = NULL;
+    cl_program program = NULL;
+    cl_kernel kernel = NULL;
+    const size_t list_bytes = (size_t)LIST_SIZE * sizeof(int);
+    size_t global_item_size = (size_t)LIST_SIZE; /* one work-item per element */
+    size_t local_item_size = 64;                 /* work-groups of 64 work-items */
+
+    /* Create the two input vectors and the output vector. */
+    A = malloc(list_bytes);
+    B = malloc(list_bytes);
+    C = malloc(list_bytes);
+    if (A == NULL || B == NULL || C == NULL) {
+        fprintf(stderr, "Out of memory allocating host vectors.\n");
+        goto cleanup;
+    }
+    for (int i = 0; i < LIST_SIZE; i++) {
+        A[i] = i;
+        B[i] = LIST_SIZE - i;
+    }
+
+    /* Load the kernel source code into source_str. */
+    fp = fopen("vector_add_kernel.cl", "r");
+    if (!fp) {
+        fprintf(stderr, "Failed to load kernel.\n");
+        goto cleanup;
+    }
+    source_str = malloc(MAX_SOURCE_SIZE);
+    if (source_str == NULL) {
+        fprintf(stderr, "Out of memory allocating kernel source buffer.\n");
+        goto cleanup;
+    }
+    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
+    fclose(fp);
+    fp = NULL;
+    if (source_size == 0) {
+        fprintf(stderr, "Kernel source is empty or unreadable.\n");
+        goto cleanup;
+    }
+
+    /* Get platform and device information. */
+    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
+    if (!vadd_cl_ok(ret, "clGetPlatformIDs")) {
+        goto cleanup;
+    }
+    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
+            &device_id, &ret_num_devices);
+    if (!vadd_cl_ok(ret, "clGetDeviceIDs")) {
+        goto cleanup;
+    }
+
+    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
+    if (!vadd_cl_ok(ret, "clCreateContext")) {
+        goto cleanup;
+    }
+    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
+    if (!vadd_cl_ok(ret, "clCreateCommandQueue")) {
+        goto cleanup;
+    }
+
+    /* Device buffers for each vector. */
+    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_bytes, NULL, &ret);
+    if (!vadd_cl_ok(ret, "clCreateBuffer (A)")) {
+        goto cleanup;
+    }
+    b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_bytes, NULL, &ret);
+    if (!vadd_cl_ok(ret, "clCreateBuffer (B)")) {
+        goto cleanup;
+    }
+    c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, list_bytes, NULL, &ret);
+    if (!vadd_cl_ok(ret, "clCreateBuffer (C)")) {
+        goto cleanup;
+    }
+
+    /* Blocking copies of A and B into their device buffers. */
+    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
+            list_bytes, A, 0, NULL, NULL);
+    if (!vadd_cl_ok(ret, "clEnqueueWriteBuffer (A)")) {
+        goto cleanup;
+    }
+    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
+            list_bytes, B, 0, NULL, NULL);
+    if (!vadd_cl_ok(ret, "clEnqueueWriteBuffer (B)")) {
+        goto cleanup;
+    }
+
+    /* The source buffer is not NUL-terminated, so its length is passed explicitly. */
+    program = clCreateProgramWithSource(context, 1,
+            (const char **)&source_str, (const size_t *)&source_size, &ret);
+    if (!vadd_cl_ok(ret, "clCreateProgramWithSource")) {
+        goto cleanup;
+    }
+    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+    if (!vadd_cl_ok(ret, "clBuildProgram")) {
+        goto cleanup;
+    }
+
+    kernel = clCreateKernel(program, "vector_add", &ret);
+    if (!vadd_cl_ok(ret, "clCreateKernel")) {
+        goto cleanup;
+    }
+
+    /* Kernel arguments, in the order the test has always passed them. */
+    ret = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha);
+    if (!vadd_cl_ok(ret, "clSetKernelArg (alpha)")) {
+        goto cleanup;
+    }
+    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj);
+    if (!vadd_cl_ok(ret, "clSetKernelArg (A)")) {
+        goto cleanup;
+    }
+    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj);
+    if (!vadd_cl_ok(ret, "clSetKernelArg (B)")) {
+        goto cleanup;
+    }
+    ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&c_mem_obj);
+    if (!vadd_cl_ok(ret, "clSetKernelArg (C)")) {
+        goto cleanup;
+    }
+
+    /* Execute the kernel over the whole list. */
+    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
+            &global_item_size, &local_item_size, 0, NULL, NULL);
+    if (!vadd_cl_ok(ret, "clEnqueueNDRangeKernel")) {
+        goto cleanup;
+    }
+
+    /* Blocking read of the device buffer C into the host array C. */
+    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
+            list_bytes, C, 0, NULL, NULL);
+    if (!vadd_cl_ok(ret, "clEnqueueReadBuffer (C)")) {
+        goto cleanup;
+    }
+
+    /* Wait for every queued command to complete. */
+    ret = clFlush(command_queue);
+    if (!vadd_cl_ok(ret, "clFlush")) {
+        goto cleanup;
+    }
+    ret = clFinish(command_queue);
+    if (!vadd_cl_ok(ret, "clFinish")) {
+        goto cleanup;
+    }
+
+    /* Check every element. The expected value is converted to int so both
+     * sides of the comparison have the element type of C. */
+    for (int i = 0; i < LIST_SIZE; i++) {
+        EXPECT_EQ((int)(alpha * A[i] + B[i]), C[i]);
+    }
+    completed = 1;
+
+cleanup:
+    if (fp != NULL) {
+        fclose(fp);
+    }
+    /* Release OpenCL objects, skipping any that were never created. */
+    if (kernel != NULL) {
+        (void)clReleaseKernel(kernel);
+    }
+    if (program != NULL) {
+        (void)clReleaseProgram(program);
+    }
+    if (a_mem_obj != NULL) {
+        (void)clReleaseMemObject(a_mem_obj);
+    }
+    if (b_mem_obj != NULL) {
+        (void)clReleaseMemObject(b_mem_obj);
+    }
+    if (c_mem_obj != NULL) {
+        (void)clReleaseMemObject(c_mem_obj);
+    }
+    if (command_queue != NULL) {
+        (void)clReleaseCommandQueue(command_queue);
+    }
+    if (context != NULL) {
+        (void)clReleaseContext(context);
+    }
+    free(source_str);
+    free(A);
+    free(B);
+    free(C);
+
+    /* Fails the test when any setup step above did not complete. */
+    EXPECT_EQ(1, completed);
+}
+
+/* Entry point: hands the command line to run_all_tests_args() and exits with
+ * status 0 regardless of what that call does. */
+int main(int argc, char **argv)
+{
+  run_all_tests_args(argc, argv);
+  return EXIT_SUCCESS;
+}
+
@@ -0,0 +1,8 @@
+/* Element-wise C = alpha * A + B over int buffers, one element per work-item. */
+__kernel void vector_add(float alpha,
+                         __global const int *A,
+                         __global const int *B,
+                         __global int *C)
+{
+    /* Each work-item handles the element at its global id in dimension 0. */
+    int gid = get_global_id(0);
+
+    /* The sum is computed in float and converted to int when stored. */
+    C[gid] = alpha * A[gid] + B[gid];
+}