add endian option in cl tensor functions

This commit is contained in:
2024-02-02 23:13:22 +01:00
parent 70b1177e5f
commit 7dd749f85e
5 changed files with 127 additions and 6 deletions
+29
View File
@@ -25,6 +25,35 @@ __kernel void prodTensori2dLin_TYPE_DOUBLE(long unsigned int M1rank, __global co
}
// Single-precision product of two flat arrays, launched on a 2-D NDRange.
//
// Each work-item multiplies one element of M0x by one element of M1x and
// stores the result in Mx. The M0x index is the fast-running index of the
// output ("NotEndian" layout): Mx[i0 + M0rank * i1] = M0x[i0] * M1x[i1].
//
//   M0rank : stride applied to the M1x index in the output
//            (presumably the number of elements in M0x -- confirm on the host side)
//   M0x    : first input array, indexed along NDRange dimension 0
//   M1x    : second input array, indexed along NDRange dimension 1
//   Mx     : output array
__kernel void prodTensor2dLinNotEndian_TYPE_FLOAT(long unsigned int M0rank, __global const float *M0x , __global const float *M1x, __global float *Mx ){
    // Work-item position rebuilt from group and local ids. This matches
    // get_global_id() only when the kernel is enqueued without a global offset.
    const size_t idx0 = get_local_id(0) + get_local_size(0) * get_group_id(0);
    const size_t idx1 = get_local_id(1) + get_local_size(1) * get_group_id(1);

    // Flattened output position: the M0x index varies fastest.
    const size_t out = M0rank * idx1 + idx0;

    Mx[out] = M0x[idx0] * M1x[idx1];
}
// Shared body of the double-precision 2-D "NotEndian" product kernels below.
//
// Each work-item multiplies one element of M0x by one element of M1x and
// stores the result in Mx, with the M0x index as the fast-running index of
// the output: Mx[i + M0rank * j] = M0x[i] * M1x[j].
//
//   M0rank : stride applied to the M1x index in the output
//            (presumably the number of elements in M0x -- confirm on the host side)
//   M0x    : first input array, indexed along NDRange dimension 0
//   M1x    : second input array, indexed along NDRange dimension 1
//   Mx     : output array
void prodTensor2dLinNotEndian_double_body(long unsigned int M0rank, __global const double *M0x, __global const double *M1x, __global double *Mx){
    // Work-item position rebuilt from group and local ids. This matches
    // get_global_id() only when the kernel is enqueued without a global offset.
    const size_t i = get_group_id(0)*get_local_size(0) + get_local_id(0);
    const size_t j = get_group_id(1)*get_local_size(1) + get_local_id(1);
    const size_t k = i + M0rank * j;
    Mx[k] = M0x[i] * M1x[j];
}

// Entry point under the name the host launcher requests: it builds the kernel
// name as "prodTensor2dLinNotEndian_" #type, which has no "i" after "Tensor".
__kernel void prodTensor2dLinNotEndian_TYPE_DOUBLE(long unsigned int M0rank, __global const double *M0x , __global const double *M1x, __global double *Mx ){
    prodTensor2dLinNotEndian_double_body(M0rank, M0x, M1x, Mx);
}

// Legacy (misspelled) entry point, kept so that any caller still using the
// old "prodTensori2d..." name keeps working.
__kernel void prodTensori2dLinNotEndian_TYPE_DOUBLE(long unsigned int M0rank, __global const double *M0x , __global const double *M1x, __global double *Mx ){
    prodTensor2dLinNotEndian_double_body(M0rank, M0x, M1x, Mx);
}
@@ -34,3 +34,37 @@ __kernel void prodContractnTensorLin_TYPE_DOUBLE(long unsigned int dSubRank, lon
// Single-precision contraction of two flat arrays ("NotEndian" layout).
//
// Work-item i computes
//     Mx[i] = sum over k in [0, dMRank) of
//             M0x[(i % dSubRank) + dSubRank * k] * M1x[(i / dSubRank) * dMRank + k]
//
//   dSubRank : divisor that splits the output index i into its M0x part
//              (i % dSubRank) and its M1x part (i / dSubRank); also the stride
//              of the summed index k inside M0x
//   dMRank   : number of terms summed; also the stride of the M1x part of the index
//   M0x, M1x : input arrays
//   Mx       : output array, one element written per work-item
__kernel void prodContractnTensorLinNotEndian_TYPE_FLOAT(long unsigned int dSubRank, long unsigned int dMRank, __global const float *M0x , __global const float *M1x, __global float *Mx ){
    // Index of the output element handled by this work-item.
    const size_t i = get_global_id(0);
    const size_t a0_id = i % dSubRank;
    const size_t a1_id = i / dSubRank;

    // Accumulate in a private variable so the loop does not read and write
    // global memory on every iteration; Mx[i] is stored exactly once.
    float acc = 0.0f;
    for (size_t k = 0; k < dMRank; k++) {
        acc += M0x[a0_id + dSubRank * k] * M1x[a1_id * dMRank + k];
    }
    Mx[i] = acc;
}
// Double-precision contraction of two flat arrays ("NotEndian" layout).
//
// Work-item i computes
//     Mx[i] = sum over k in [0, dMRank) of
//             M0x[(i % dSubRank) + dSubRank * k] * M1x[(i / dSubRank) * dMRank + k]
//
//   dSubRank : divisor that splits the output index i into its M0x part
//              (i % dSubRank) and its M1x part (i / dSubRank); also the stride
//              of the summed index k inside M0x
//   dMRank   : number of terms summed; also the stride of the M1x part of the index
//   M0x, M1x : input arrays
//   Mx       : output array, one element written per work-item
__kernel void prodContractnTensorLinNotEndian_TYPE_DOUBLE(long unsigned int dSubRank, long unsigned int dMRank, __global const double *M0x , __global const double *M1x, __global double *Mx ){
    // Index of the output element handled by this work-item.
    const size_t i = get_global_id(0);
    const size_t a0_id = i % dSubRank;
    const size_t a1_id = i / dSubRank;

    // Accumulate in a private variable so the loop does not read and write
    // global memory on every iteration; Mx[i] is stored exactly once.
    double acc = 0.0;
    for (size_t k = 0; k < dMRank; k++) {
        acc += M0x[a0_id + dSubRank * k] * M1x[a1_id * dMRank + k];
    }
    Mx[i] = acc;
}
+19
View File
@@ -16,6 +16,25 @@ __kernel void prodTensorLin_TYPE_DOUBLE(long unsigned int M1rank, __global const
Mx[k] = M0x[i] * M1x[j];
}
// Single-precision product of two flat arrays, launched on a 1-D NDRange.
//
// Each work-item owns one output element. Its flat index is split so that the
// M0x index is the fast-running one ("NotEndian" layout):
//     Mx[out] = M0x[out % M0rank] * M1x[out / M0rank]
//
//   M0rank : divisor used to split the flat output index
//            (presumably the number of elements in M0x -- confirm on the host side)
//   M0x    : first input array
//   M1x    : second input array
//   Mx     : output array
__kernel void prodTensorLinNotEndian_TYPE_FLOAT(long unsigned int M0rank, __global const float *M0x , __global const float *M1x, __global float *Mx ){
    // Flat index of the output element handled by this work-item.
    const size_t out = get_global_id(0);

    // Remainder selects the M0x element, quotient selects the M1x element.
    const size_t idx0 = out % M0rank;
    const size_t idx1 = out / M0rank;

    Mx[out] = M0x[idx0] * M1x[idx1];
}
// Double-precision product of two flat arrays, launched on a 1-D NDRange.
//
// Each work-item owns one output element. Its flat index is split so that the
// M0x index is the fast-running one ("NotEndian" layout):
//     Mx[out] = M0x[out % M0rank] * M1x[out / M0rank]
//
//   M0rank : divisor used to split the flat output index
//            (presumably the number of elements in M0x -- confirm on the host side)
//   M0x    : first input array
//   M1x    : second input array
//   Mx     : output array
__kernel void prodTensorLinNotEndian_TYPE_DOUBLE(long unsigned int M0rank, __global const double *M0x , __global const double *M1x, __global double *Mx ){
    // Flat index of the output element handled by this work-item.
    const size_t out = get_global_id(0);

    // Remainder selects the M0x element, quotient selects the M1x element.
    const size_t idx0 = out % M0rank;
    const size_t idx1 = out / M0rank;

    Mx[out] = M0x[idx0] * M1x[idx1];
}
+44 -6
View File
@@ -146,10 +146,22 @@ void cl_tensorProd_##type(tensor_##type **MM, tensor_##type *M0, tensor_##type *
(*MM)=CREATE_TENSOR_##type(dd); \
tensor_##type *M = *MM; \
char *file_cl_src = "../src/kernel_ProdTensor.cl"; \
char *func_cl_name = "prodTensorLin_" #type; \
char *func_cl_nameEndian = "prodTensorLin_" #type; \
char *func_cl_nameNotEndian = "prodTensorLinNotEndian_" #type; \
char *func_cl_name; \
size_t MeDimRank;\
if(endian){\
func_cl_name = func_cl_nameEndian;\
MeDimRank = M1->dim->rank;\
\
}else{\
func_cl_name = func_cl_nameNotEndian;\
MeDimRank = M0->dim->rank;\
\
}\
SETUP_cl_KERNEL_(type,file_cl_src,func_cl_name);\
/*/ Set the arguments of the kernel */ \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&(M1->dim->rank)); \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&MeDimRank); \
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&M0_mem_obj); \
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&M1_mem_obj); \
ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&M_mem_obj); \
@@ -188,11 +200,24 @@ void cl_tensorContractnProd_##type(tensor_##type** MM, tensor_##type *M0, tensor
*MM = CREATE_TENSOR_##type(dd);\
tensor_##type *M= *MM;\
char *file_cl_src = "../src/kernel_ProdContractnTensor.cl"; \
char *func_cl_name = "prodContractnTensorLin_" #type; \
/*char *func_cl_name = "prodContractnTensorLin_" #type;*/ \
char *func_cl_nameEndian = "prodContractnTensorLin_" #type; \
char *func_cl_nameNotEndian = "prodContractnTensorLinNotEndian_" #type; \
char *func_cl_name; \
size_t dSubRank;\
if(endian){\
func_cl_name = func_cl_nameEndian;\
dSubRank = dSub1->rank;\
\
}else{\
func_cl_name = func_cl_nameNotEndian;\
dSubRank = dSub0->rank;\
\
}\
SETUP_cl_KERNEL_(type,file_cl_src,func_cl_name);\
\
/*/ Set the arguments of the kernel */ \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&(dSub1->rank)); \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&dSubRank); \
ret |= clSetKernelArg(kernel, 1, sizeof(size_t), (void *)&(dM->rank)); \
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&M0_mem_obj); \
ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&M1_mem_obj); \
@@ -211,10 +236,23 @@ void cl2d_tensorProd_##type(tensor_##type **MM, tensor_##type *M0, tensor_##type
(*MM)=CREATE_TENSOR_##type(dd); \
tensor_##type *M = *MM; \
char *file_cl_src = "../src/kernel_2d_ProdTensor.cl"; \
char *func_cl_name = "prodTensor2dLin_" #type; \
/*char *func_cl_name = "prodTensor2dLin_" #type;*/ \
char *func_cl_nameEndian = "prodTensor2dLin_" #type; \
char *func_cl_nameNotEndian = "prodTensor2dLinNotEndian_" #type; \
char *func_cl_name; \
size_t MeDimRank;\
if(endian){\
func_cl_name = func_cl_nameEndian;\
MeDimRank = M1->dim->rank;\
\
}else{\
func_cl_name = func_cl_nameNotEndian;\
MeDimRank = M0->dim->rank;\
\
}\
SETUP_cl_KERNEL_(type,file_cl_src,func_cl_name);\
/*/ Set the arguments of the kernel */ \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&(M1->dim->rank)); \
ret = clSetKernelArg(kernel, 0, sizeof(size_t), (void *)&(MeDimRank)); \
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&M0_mem_obj); \
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&M1_mem_obj); \
ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&M_mem_obj); \