mv tensor repo
This commit is contained in:
@@ -0,0 +1,493 @@
|
||||
/*#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "cuda.h"
|
||||
#include "cuda_runtime.h"
|
||||
*/
|
||||
|
||||
#include "d_tensCuda.h"
|
||||
//#include "index.h"
|
||||
#include <stdio.h>
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
//1D grid of 1D blocks
|
||||
/* Flat thread index for a launch that only uses the x component of both
 * the grid and the block. The arithmetic is done in unsigned int and then
 * converted to the int return type. */
__device__
int d_getGlobalIdx_1D_1D() {
    const unsigned int firstThreadOfBlock = blockIdx.x * blockDim.x;
    return firstThreadOfBlock + threadIdx.x;
}
|
||||
//1D grid of 2D blocks
|
||||
/* Flat thread index for blocks addressed by blockIdx.x whose threads are
 * laid out in (x, y); threadIdx.x varies fastest inside a block. */
__device__
int d_getGlobalIdx_1D_2D() {
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned int offsetInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    return blockIdx.x * threadsPerBlock + offsetInBlock;
}
|
||||
//1D grid of 3D blocks
|
||||
/* Flat thread index for blocks addressed by blockIdx.x whose threads are
 * laid out in (x, y, z); threadIdx.x varies fastest, threadIdx.z slowest. */
__device__
int d_getGlobalIdx_1D_3D() {
    const unsigned int threadsPerSlice = blockDim.x * blockDim.y;
    const unsigned int threadsPerBlock = threadsPerSlice * blockDim.z;
    const unsigned int offsetInBlock = threadIdx.z * threadsPerSlice
        + threadIdx.y * blockDim.x
        + threadIdx.x;
    return blockIdx.x * threadsPerBlock + offsetInBlock;
}
|
||||
//2D grid of 1D blocks
|
||||
/* Flat thread index for a grid addressed by (blockIdx.x, blockIdx.y) with
 * threads addressed by threadIdx.x only. Blocks are numbered with
 * blockIdx.x varying fastest. */
__device__ int d_getGlobalIdx_2D_1D() {
    const int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    return blockId * blockDim.x + threadIdx.x;
}
|
||||
//2D grid of 2D blocks
|
||||
/* Flat thread index for a grid addressed by (blockIdx.x, blockIdx.y) with
 * threads addressed by (threadIdx.x, threadIdx.y). The x components vary
 * fastest at both levels. */
__device__
int d_getGlobalIdx_2D_2D() {
    const int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned int rowOffset = threadIdx.y * blockDim.x;
    return blockId * threadsPerBlock + rowOffset + threadIdx.x;
}
|
||||
//2D grid of 3D blocks
|
||||
/* Flat thread index for a grid addressed by (blockIdx.x, blockIdx.y) with
 * threads addressed by (threadIdx.x, threadIdx.y, threadIdx.z). */
__device__
int d_getGlobalIdx_2D_3D() {
    const int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    const unsigned int threadsPerSlice = blockDim.x * blockDim.y;
    const unsigned int sliceOffset = threadIdx.z * threadsPerSlice;
    const unsigned int rowOffset = threadIdx.y * blockDim.x;
    return blockId * (threadsPerSlice * blockDim.z)
        + sliceOffset
        + rowOffset + threadIdx.x;
}
|
||||
//3D grid of 1D blocks
|
||||
/* Flat thread index for a grid addressed by all three blockIdx components
 * with threads addressed by threadIdx.x only. Blocks are numbered with
 * blockIdx.x fastest and blockIdx.z slowest. */
__device__
int d_getGlobalIdx_3D_1D() {
    const unsigned int blocksPerLayer = gridDim.x * gridDim.y;
    const int blockId = blockIdx.x + blockIdx.y * gridDim.x
        + blocksPerLayer * blockIdx.z;
    return blockId * blockDim.x + threadIdx.x;
}
|
||||
//3D grid of 2D blocks
|
||||
/* Flat thread index for a grid addressed by all three blockIdx components
 * with threads addressed by (threadIdx.x, threadIdx.y). */
__device__
int d_getGlobalIdx_3D_2D() {
    const unsigned int blocksPerLayer = gridDim.x * gridDim.y;
    const int blockId = blockIdx.x + blockIdx.y * gridDim.x
        + blocksPerLayer * blockIdx.z;
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned int rowOffset = threadIdx.y * blockDim.x;
    return blockId * threadsPerBlock + rowOffset + threadIdx.x;
}
|
||||
//3D grid of 3D blocks
|
||||
/* Flat thread index when all three components of both blockIdx and
 * threadIdx are in use. The x components vary fastest, z slowest. */
__device__
int d_getGlobalIdx_3D_3D() {
    const unsigned int blocksPerLayer = gridDim.x * gridDim.y;
    const int blockId = blockIdx.x + blockIdx.y * gridDim.x
        + blocksPerLayer * blockIdx.z;
    const unsigned int threadsPerSlice = blockDim.x * blockDim.y;
    const unsigned int sliceOffset = threadIdx.z * threadsPerSlice;
    const unsigned int rowOffset = threadIdx.y * blockDim.x;
    return blockId * (threadsPerSlice * blockDim.z)
        + sliceOffset
        + rowOffset + threadIdx.x;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
/* Splits a linear index into per-axis coordinates, peeling axes from the
 * last one down to axis 1; whatever remains is stored in ret[0]. This is
 * the inverse of d_CoordToLinearEnd (axis 0 varies fastest).
 *
 *   ret  - output, at least `rank` ints (ret[0] is always written)
 *   lin  - linear index to decompose
 *   dim  - extent of each axis
 *   rank - number of axes
 *   size - product of all extents, used as the starting stride
 */
__device__ void d_LinearToCoordEnd(int* ret, size_t lin, int* dim, int rank, size_t size) {
    size_t remainder = lin;
    size_t stride = size;
    int axis = rank - 1;
    while (axis > 0) {
        stride /= dim[axis];
        ret[axis] = remainder / stride;
        remainder %= stride;
        --axis;
    }
    ret[0] = remainder;
}
|
||||
|
||||
/* Folds per-axis coordinates into a linear index in which axis 0 varies
 * fastest: the stride starts at 1 and is multiplied by each extent in
 * ascending axis order.
 *
 *   coo  - coordinate along each axis
 *   dim  - extent of each axis
 *   rank - number of axes
 * Returns the linear index (0 when rank <= 0).
 */
__device__ size_t d_CoordToLinearEnd(int* coo, int* dim, int rank) {
    size_t linear = 0;
    size_t stride = 1;
    int axis = 0;
    while (axis < rank) {
        linear += (coo[axis] * stride);
        stride *= dim[axis];
        ++axis;
    }
    return linear;
}
|
||||
|
||||
/* Folds per-axis coordinates into a linear index in which the last axis
 * varies fastest: the stride starts at 1 and is multiplied by each extent
 * in descending axis order.
 *
 *   coo  - coordinate along each axis
 *   dim  - extent of each axis
 *   rank - number of axes
 * Returns the linear index (0 when rank <= 0).
 */
__device__ size_t d_CoordToLinear(int* coo, int* dim, int rank) {
    size_t linear = 0;
    size_t stride = 1;
    int axis = rank - 1;
    while (axis >= 0) {
        linear += (coo[axis] * stride);
        stride *= dim[axis];
        --axis;
    }
    return linear;
}
|
||||
|
||||
|
||||
|
||||
/* Splits a linear index into per-axis coordinates, peeling axes from
 * axis 0 up to the second-to-last; the remainder is stored in
 * ret[rank - 1]. This is the inverse of d_CoordToLinear (last axis
 * varies fastest).
 *
 *   ret  - output, at least `rank` ints
 *   lin  - linear index to decompose
 *   dim  - extent of each axis
 *   rank - number of axes (ret[rank - 1] is written unconditionally)
 *   size - product of all extents, used as the starting stride
 */
__device__ void d_LinearToCoord(int* ret, size_t lin, int* dim, int rank, size_t size) {
    size_t remainder = lin;
    size_t stride = size;
    const int last = rank - 1;
    int axis = 0;
    while (axis < last) {
        stride /= dim[axis];
        ret[axis] = remainder / stride;
        remainder %= stride;
        ++axis;
    }
    ret[last] = remainder;
}
|
||||
/*__device__ void d_LinearToSplitSubrankLimSz(size_t& part0, size_t& part1, size_t lin, int* dim, int rank, int rankA, size_t size, size_t sizeA) {
|
||||
size_t sm = lin;
|
||||
size_t pp = size;
|
||||
size_t s = 0;
|
||||
size_t p = sizeA;
|
||||
int ret;// = new int[rank];
|
||||
for (int i = 0; i < rank; ++i) {
|
||||
pp /= dim[i];
|
||||
ret = sm / pp;
|
||||
p /= dim[i];
|
||||
s += ret * p;
|
||||
|
||||
sm %= pp;
|
||||
if (i == rankA - 1) {
|
||||
part0 = s;
|
||||
s = 0;
|
||||
p = size / sizeA;
|
||||
}
|
||||
|
||||
}
|
||||
part1 = s;
|
||||
|
||||
}*/
|
||||
/* Splits a linear index over `dim` (last axis fastest) into two linear
 * indices: part0 over the first `rankA` axes and part1 over the remaining
 * axes, each again with its last axis fastest.
 *
 *   part0, part1 - outputs
 *   lin          - linear index into the full array
 *   dim, rank    - extents and number of axes of the full array
 *   rankA        - number of leading axes that belong to the first part
 *   size         - product of all extents
 *   sizeA        - product of the first rankA extents (must be non-zero,
 *                  size / sizeA is evaluated unconditionally)
 */
__device__ void d_LinearToSplitSubrankLimSz(size_t& part0, size_t& part1, size_t lin, int* dim, int rank, int rankA, size_t size, size_t sizeA) {
    size_t remainder = lin;
    size_t fullStride = size;
    int coord;
    int axis = 0;

    // Leading axes: accumulate the offset inside the first operand.
    size_t strideA = sizeA;
    size_t offsetA = 0;
    while (axis < rankA) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideA /= dim[axis];
        offsetA += coord * strideA;
        ++axis;
    }
    part0 = offsetA;

    // Trailing axes: same walk, strides now relative to the second operand.
    size_t strideB = size / sizeA;
    size_t offsetB = 0;
    while (axis < rank) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideB /= dim[axis];
        offsetB += coord * strideB;
        ++axis;
    }
    part1 = offsetB;
}
|
||||
/* "End" (axis 0 fastest) counterpart of d_LinearToSplitSubrankLimSz.
 *
 * Splits a linear index over `dim` into part0 (offset over the first
 * `rankA` axes) and part1 (offset over the remaining axes). Axes are
 * peeled from the last one downwards, so the trailing (second-operand)
 * axes are processed first.
 *
 * Fix: the previous version started the trailing-axes pass with the stride
 * base of the FIRST operand (sizeA) and switched to size / sizeA for the
 * leading axes, i.e. the two bases were swapped. Integer division then
 * collapsed strides to 0/1 and produced wrong offsets whenever
 * sizeA != size / sizeA. The bases are now matched to the axes they
 * describe. part1 is also always written (0 when there are no trailing
 * axes); it used to stay uninitialised in that case.
 *
 *   part0, part1 - outputs
 *   lin          - linear index into the full array
 *   dim, rank    - extents and number of axes of the full array
 *   rankA        - number of leading axes that belong to the first part
 *   size         - product of all extents
 *   sizeA        - product of the first rankA extents (must be non-zero)
 */
__device__ void d_LinearToSplitSubrankLimSzEnd(size_t& part0, size_t& part1, size_t lin, int* dim, int rank, int rankA, size_t size, size_t sizeA) {
    size_t remainder = lin;
    size_t fullStride = size;
    int coord;
    int axis = rank - 1;

    // Trailing axes [rankA, rank): offset inside the second operand.
    size_t strideB = size / sizeA;
    size_t offsetB = 0;
    for (; axis >= rankA && axis >= 0; --axis) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideB /= dim[axis];
        offsetB += coord * strideB;
    }
    part1 = offsetB;

    // Leading axes [0, rankA): offset inside the first operand.
    size_t strideA = sizeA;
    size_t offsetA = 0;
    for (; axis >= 0; --axis) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideA /= dim[axis];
        offsetA += coord * strideA;
    }
    part0 = offsetA;
}
|
||||
|
||||
|
||||
/* Copies src[i + debSrc] into dst[i] for every i in [debDst, finDst).
 * Note that debSrc is an offset added to the destination index, not an
 * absolute start position in src. */
__device__ void d_subArray(int* dst, int* src, int debDst, int finDst, int debSrc) {
    int pos = debDst;
    while (pos < finDst) {
        dst[pos] = src[pos + debSrc];
        pos++;
    }
}
|
||||
|
||||
/* Kernel: element-wise fill of C with products of one element of A and one
 * element of B, C[i] = A[lin0] * B[lin1], where (lin0, lin1) is obtained by
 * splitting i with d_LinearToSplitSubrankLimSz (first rankA axes of dimC
 * index A, the remaining axes index B).
 *
 * Launch layout: one thread per element of C, using only the x component
 * of grid and block; threads with i >= size do nothing.
 *
 *   C, dimC, rankC, size - output buffer, its extents, rank and element count
 *   A, rankA, sizeA      - first operand, its rank and element count
 *   B                    - second operand
 *   dimA, dimB, rankB    - not read by this kernel
 *
 * Fix: the thread index is widened to size_t before the multiplication so
 * blockIdx.x * blockDim.x cannot wrap at 2^32 for very large outputs.
 */
template<typename T>
__global__ void d_prodTensor(T* C, int* dimC, int rankC, size_t size, T* A, int* dimA, int rankA, size_t sizeA, T* B, int* dimB, int rankB) {
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= size) return;

    size_t lin0, lin1;
    d_LinearToSplitSubrankLimSz(lin0, lin1, i, dimC, rankC, rankA, size, sizeA);

    C[i] = A[lin0] * B[lin1];
}

template __global__ void d_prodTensor<float>(float* C, int* dimC, int rankC, size_t size, float* A, int* dimA, int rankA, size_t sizeA, float* B, int* dimB, int rankB);
|
||||
|
||||
/* Kernel: same as d_prodTensor but the index of C is split with
 * d_LinearToSplitSubrankLimSzEnd, C[i] = A[lin0] * B[lin1].
 *
 * Launch layout: one thread per element of C, using only the x component
 * of grid and block; threads with i >= size do nothing.
 *
 *   C, dimC, rankC, size - output buffer, its extents, rank and element count
 *   A, rankA, sizeA      - first operand, its rank and element count
 *   B                    - second operand
 *   dimA, dimB, rankB    - not read by this kernel
 *
 * Fix: the thread index is widened to size_t before the multiplication so
 * blockIdx.x * blockDim.x cannot wrap at 2^32 for very large outputs.
 */
template<typename T>
__global__ void d_prodTensorEnd(T* C, int* dimC, int rankC, size_t size, T* A, int* dimA, int rankA, size_t sizeA, T* B, int* dimB, int rankB) {
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= size) return;

    size_t lin0, lin1;
    d_LinearToSplitSubrankLimSzEnd(lin0, lin1, i, dimC, rankC, rankA, size, sizeA);

    C[i] = A[lin0] * B[lin1];
}

template __global__ void d_prodTensorEnd<float>(float* C, int* dimC, int rankC, size_t size, float* A, int* dimA, int rankA, size_t sizeA, float* B, int* dimB, int rankB);
|
||||
|
||||
/* Chooses the "smaller" of two shapes.
 *
 *   - rank0 > rank1 : dim/rank become a copy of dim1/rank1, rev = true
 *   - rank0 < rank1 : dim/rank become a copy of dim0/rank0, rev = false
 *   - equal ranks   : dim[i] = min(dim0[i], dim1[rank1 - 1 - i]), rev = false
 *
 *   dim  - output extents; must hold at least min(rank0, rank1) ints
 *   rank - output rank
 *   rev  - output flag, true only when the second shape was selected
 *
 * Fixes:
 *   - the rank0 < rank1 branch copied rank1 elements out of dim0, reading
 *     past the end of dim0 and writing more than `rank` entries of dim;
 *     it now copies rank0 elements.
 *   - the equal-rank branch compared the not-yet-written output dim[i]
 *     against dim1; it now compares dim0[i], which is what the else-arm
 *     (dim[i] = dim0[i]) implies was intended.
 */
__device__ void d_minReverse(int* dim, int& rank, const int* dim0, int rank0, const int* dim1, int rank1, bool& rev) {
    if (rank0 > rank1) {
        rank = rank1;
        for (int i = 0; i < rank1; ++i) dim[i] = dim1[i];
        rev = true;
    }
    else if (rank0 < rank1) {
        rank = rank0;
        for (int i = 0; i < rank0; ++i) dim[i] = dim0[i];
        rev = false;
    }
    else {// rank0 == rank1
        rank = rank0;
        for (int i = 0; i < rank0; i++) {
            // dim1 is read back-to-front against dim0.
            const int mirrored = dim1[rank1 - 1 - i];
            dim[i] = (dim0[i] > mirrored) ? mirrored : dim0[i];
        }
        rev = false;
    }
}
|
||||
|
||||
/* Reverses the first `sz` elements of `arr` in place.
 *
 * Rewritten as a two-cursor swap. The previous version copied the first
 * half into a per-thread heap buffer and then read the second half back
 * from it, which had three problems:
 *   - for odd sz the middle element was read from a buffer slot that was
 *     never written, so it received garbage;
 *   - when the allocation failed, the failure was printed and the null
 *     pointer was dereferenced anyway;
 *   - every call paid for a device-heap allocation.
 * With no allocation there is no failure path left, so the diagnostic
 * printing of the device limits is gone as well.
 *
 *   arr - array to reverse
 *   sz  - number of elements; sz <= 1 leaves the array untouched
 */
__device__ void d_reverseArray(int* arr, int sz) {
    for (int lo = 0, hi = sz - 1; lo < hi; ++lo, --hi) {
        const int held = arr[lo];
        arr[lo] = arr[hi];
        arr[hi] = held;
    }
}
|
||||
|
||||
/* Returns the smaller of the two arguments (b when they are equal). */
__device__ int d_min(int a, int b) {
    return (a < b) ? a : b;
}
|
||||
|
||||
/* Writes src0[debSrc0 .. finSrc0) followed by src1[debSrc1 .. finSrc1)
 * into dst, starting at dst[debDst]. All end indices are exclusive. */
__device__ void d_concatArray(int* dst, int* src0, int* src1, int debDst, int debSrc0, int finSrc0, int debSrc1, int finSrc1) {
    int out = debDst;

    int in = debSrc0;
    while (in < finSrc0) {
        dst[out] = src0[in];
        ++out;
        ++in;
    }

    in = debSrc1;
    while (in < finSrc1) {
        dst[out] = src1[in];
        ++out;
        ++in;
    }
}
|
||||
|
||||
|
||||
|
||||
/* Maps (output linear index, contraction index) to a pair of linear
 * indices, one per operand, in three passes:
 *
 *   1. the first (rankA - dMrank) axes of `dim` contribute to part0 with
 *      strides derived from sizeA;
 *   2. the remaining axes of `dim` contribute to part1 with strides
 *      derived from sizeB / dMsize;
 *   3. `ind` is decomposed over the extents dM; each coordinate is added
 *      to part0 with strides derived from dMsize and to part1 with
 *      strides derived from sizeB.
 *
 *   part0, part1       - outputs
 *   lin                - linear index over `dim` (last axis fastest)
 *   dim, rank, size    - extents, rank and element count being decomposed
 *   rankA, sizeA       - rank and element count of the first operand
 *   rankB              - not read
 *   sizeB              - element count of the second operand
 *   dM, dMrank, dMsize - extents, rank and element count of the shared axes
 *   ind                - linear index over dM
 */
__device__ void d_ConcatLinearToSplitSubrankLimSz(size_t& part0, size_t& part1, size_t lin, int* dim, int rank, int rankA, int rankB, size_t size, size_t sizeA, size_t sizeB, int* dM, int dMrank, size_t dMsize, int ind) {
    const int freeRankA = rankA - dMrank;
    int coord;
    int axis = 0;

    size_t remainder = lin;
    size_t fullStride = size;

    // Pass 1: free axes of the first operand.
    size_t offsetA = 0;
    size_t strideA = sizeA;
    while (axis < freeRankA) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideA /= dim[axis];
        offsetA += coord * strideA;
        ++axis;
    }

    // Pass 2: free axes of the second operand.
    size_t offsetB = 0;
    size_t strideB = sizeB / dMsize;
    while (axis < rank) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideB /= dim[axis];
        offsetB += coord * strideB;
        ++axis;
    }

    // Pass 3: shared axes, fed by `ind`, contribute to both offsets.
    size_t sharedRemainder = ind;
    size_t sharedStride = dMsize;
    strideA = dMsize;
    strideB = sizeB;
    int m = 0;
    while (m < dMrank) {
        sharedStride /= dM[m];
        coord = sharedRemainder / sharedStride;
        sharedRemainder %= sharedStride;
        strideA /= dM[m];
        offsetA += coord * strideA;
        strideB /= dM[m];
        offsetB += coord * strideB;
        m++;
    }

    part0 = offsetA;
    part1 = offsetB;
}
|
||||
|
||||
/* Computes the part of each operand's linear index that depends only on
 * the output index (the contraction-independent base offsets):
 *
 *   - the first (rankA - dMrank) axes of `dim` contribute to part0 with
 *     strides derived from sizeA;
 *   - the remaining axes contribute to part1 with strides derived from
 *     sizeB / dMsize.
 *
 *   part0, part1    - outputs
 *   lin             - linear index over `dim` (last axis fastest)
 *   dim, rank, size - extents, rank and element count being decomposed
 *   rankA, sizeA    - rank and element count of the first operand
 *   sizeB           - element count of the second operand
 *   dMrank, dMsize  - rank and element count of the shared axes
 *   rankB, dM       - not read
 */
__device__ void d_SplitLineardToSubrank(size_t& part0, size_t& part1, size_t lin, int* dim, int rank, int rankA, int rankB, size_t size, size_t sizeA, size_t sizeB, int* dM, int dMrank, size_t dMsize) {
    const int freeRankA = rankA - dMrank;
    int coord;
    int axis = 0;

    size_t remainder = lin;
    size_t fullStride = size;

    // Free axes of the first operand.
    size_t offsetA = 0;
    size_t strideA = sizeA;
    while (axis < freeRankA) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideA /= dim[axis];
        offsetA += coord * strideA;
        ++axis;
    }

    // Free axes of the second operand.
    size_t offsetB = 0;
    size_t strideB = sizeB / dMsize;
    while (axis < rank) {
        fullStride /= dim[axis];
        coord = remainder / fullStride;
        remainder %= fullStride;
        strideB /= dim[axis];
        offsetB += coord * strideB;
        ++axis;
    }

    part0 = offsetA;
    part1 = offsetB;
}
|
||||
|
||||
|
||||
/* Adds the contribution of a contraction index to two base offsets.
 *
 * `ind` is decomposed over the extents dM; each resulting coordinate is
 * added to p0 with strides derived from dMsize and to p1 with strides
 * derived from sizeB. The sums are returned in part0 / part1.
 *
 *   part0, part1       - outputs
 *   p0, p1             - base offsets (as produced by d_SplitLineardToSubrank)
 *   size               - not read
 *   sizeB              - element count of the second operand
 *   dM, dMrank, dMsize - extents, rank and element count of the shared axes
 *   ind                - linear index over dM
 */
__device__ void d_UnionConcatLinearSplitedSubrank(size_t& part0, size_t& part1, size_t p0, size_t p1, size_t size, size_t sizeB, int* dM, int dMrank, size_t dMsize, int ind) {
    size_t offsetA = p0;
    size_t offsetB = p1;

    size_t sharedRemainder = ind;
    size_t sharedStride = dMsize;
    size_t strideA = dMsize;
    size_t strideB = sizeB;

    int coord;
    int m = 0;
    while (m < dMrank) {
        sharedStride /= dM[m];
        coord = sharedRemainder / sharedStride;
        sharedRemainder %= sharedStride;
        strideA /= dM[m];
        offsetA += coord * strideA;
        strideB /= dM[m];
        offsetB += coord * strideB;
        m++;
    }

    part0 = offsetA;
    part1 = offsetB;
}
|
||||
|
||||
/* Kernel: tensor contraction. For every output element i,
 *
 *     C[i] = sum over k in [0, dMsize) of A[lin0(i, k)] * B[lin1(i, k)]
 *
 * where the k-independent base offsets come from d_SplitLineardToSubrank
 * and the k-dependent part is added by d_UnionConcatLinearSplitedSubrank.
 *
 * Launch layout: one thread per element of C, using only the x component
 * of grid and block; threads with i >= sizeC do nothing.
 *
 *   C, dimC, rankC, sizeC - output buffer, its extents, rank, element count
 *   A, rankA, sizeA       - first operand, its rank and element count
 *   B, rankB, sizeB       - second operand, its rank and element count
 *   dM, dMrank, dMsize    - extents, rank and element count of the
 *                           contracted axes
 *
 * Changes:
 *   - the thread index is computed directly in size_t instead of going
 *     through the int-returning d_getGlobalIdx_1D_1D(), which could wrap
 *     and sign-extend for very large launches;
 *   - the sum is accumulated in a local variable and stored once, instead
 *     of doing a read-modify-write on C[i] for every k.
 */
template<typename T>
__global__ void d_TensorContractnReverseProd(T* C, int* dimC, int rankC, size_t sizeC, T* A, int rankA, size_t sizeA, T* B, int rankB, size_t sizeB, int* dM, int dMrank, size_t dMsize) {
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= sizeC) return;

    // Offsets into A and B that do not depend on the contraction index.
    size_t p0, p1;
    d_SplitLineardToSubrank(p0, p1, i, dimC, rankC, rankA, rankB, sizeC, sizeA, sizeB, dM, dMrank, dMsize);

    T acc = 0;
    for (size_t k = 0; k < dMsize; k++) {
        size_t lin0, lin1;
        d_UnionConcatLinearSplitedSubrank(lin0, lin1, p0, p1, sizeC, sizeB, dM, dMrank, dMsize, k);
        acc += A[lin0] * B[lin1];
    }
    C[i] = acc;
}

template
__global__ void d_TensorContractnReverseProd<float>(float* C, int* dimC, int rankC, size_t size, float* A, int rankA, size_t sizeA, float* B, int rankB, size_t sizeB, int* dM, int dMrank, size_t dMsize);
|
||||
|
||||
/* Maps a linear index in a source array to the linear index of the same
 * element in an axis-permuted destination array (both with the last axis
 * fastest).
 *
 * For each source axis i the coordinate is extracted from `src`, then
 * multiplied by the destination stride of axis inversePerm[i], i.e. the
 * product of dDst[j] for j > inversePerm[i].
 *
 *   dst         - output linear index
 *   src         - linear index over dSrc
 *   inversePerm - for each source axis, the destination axis it lands on
 *   sizeA       - element count (product of the dSrc extents)
 *   rankDst     - number of destination axes
 *   rankSrc     - number of source axes
 *   dDst, dSrc  - destination / source extents
 *
 * Fixes:
 *   - the sanity check used `s > sizeA`, letting the first out-of-range
 *     index (== sizeA) through silently; it is now `>=`;
 *   - the diagnostic printed size_t values with %ld, which is wrong where
 *     long is 32 bits; values are now cast and printed with %llu. The
 *     "siez" typo in the message is corrected as well.
 */
__device__ void d_LinearTransformCoord(size_t& dst, size_t src, int* inversePerm, size_t sizeA, int rankDst, int rankSrc, int* dDst, int* dSrc) {
    size_t remainder = src;
    size_t srcStride = sizeA;
    size_t out = 0;

    for (int i = 0; i < rankSrc; ++i) {
        srcStride /= dSrc[i];
        const int coord = remainder / srcStride;
        remainder %= srcStride;

        // Stride of the destination axis this source axis maps to.
        size_t dstStride = 1;
        for (int j = inversePerm[i] + 1; j < rankDst; j++) {
            dstStride *= dDst[j];
        }
        out += coord * dstStride;
    }

    dst = out;
    if (out >= sizeA) {
        printf("I have a problem in LinearTransformCoord: s:%llu size:%llu \n",
               (unsigned long long)out, (unsigned long long)sizeA);
    }
}
|
||||
|
||||
/* Kernel: axis permutation. Each thread takes source element A[i],
 * computes its destination index with d_LinearTransformCoord and stores
 * it in C[img].
 *
 * Launch layout: one thread per element, using only the x component of
 * grid and block; threads with i >= sizeC do nothing. A[i] is read for
 * every i < sizeC, so A must hold at least sizeC elements.
 *
 *   C, dimC, rankC, sizeC - destination buffer, extents, rank, element count
 *   A, dimA, rankA, sizeA - source buffer, extents, rank, element count
 *   invPerm               - for each source axis, the destination axis
 *
 * Fixes:
 *   - the thread index is computed directly in size_t instead of going
 *     through the int-returning d_getGlobalIdx_1D_1D();
 *   - the diagnostic printed size_t values with %ld; they are now cast and
 *     printed with %llu.
 */
template<typename T>
__global__ void d_PermLinearTransformCoord(T* C, int* dimC, int rankC, size_t sizeC, T* A, int* dimA, int rankA, size_t sizeA, int* invPerm) {
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= sizeC) return;

    size_t img = 0;
    d_LinearTransformCoord(img, i, invPerm, sizeA, rankC, rankA, dimC, dimA);

    if (img < sizeC) {
        C[img] = A[i];
    }
    else {
        // Should be unreachable for a valid permutation; report instead of
        // writing out of bounds.
        printf("something wrong in device: i:%llu , s:%llu\n",
               (unsigned long long)i, (unsigned long long)img);
    }
}

template
__global__ void d_PermLinearTransformCoord<float>(float* C, int* dimC, int rankC, size_t size, float* A, int* dimA, int rankA, size_t sizeA, int* invPerm);
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
#ifndef __D_CUDA_TENSOR_H__
|
||||
#define __D_CUDA_TENSOR_H__
|
||||
|
||||
#include "cuda.h"
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
//#include "cuda_device_runtime_api.h"
|
||||
|
||||
//#include "/home/fanasina/progr_/ptens0neD/tensor/tensCuda/d_tensCuda.h"
|
||||
#include "tensor/tensCuda/d_tensCuda.h"
|
||||
|
||||
|
||||
//#1D grid of 1D blocks
|
||||
__device__ int d_getGlobalIdx_1D_1D();
|
||||
//#1D grid of 2D blocks
|
||||
__device__ int d_getGlobalIdx_1D_2D();
|
||||
//#1D grid of 3D blocks
|
||||
__device__ int d_getGlobalIdx_1D_3D();
|
||||
//#2D grid of 1D blocks
|
||||
__device__ int d_getGlobalIdx_2D_1D();
|
||||
//#2D grid of 2D blocks
|
||||
__device__ int d_getGlobalIdx_2D_2D();
|
||||
//2D grid of 3D blocks
|
||||
__device__ int d_getGlobalIdx_2D_3D();
|
||||
//#3D grid of 1D blocks
|
||||
__device__ int d_getGlobalIdx_3D_1D();
|
||||
//#3D grid of 2D blocks
|
||||
__device__ int d_getGlobalIdx_3D_2D();
|
||||
//#3D grid of 3D blocks
|
||||
__device__ int d_getGlobalIdx_3D_3D();
|
||||
|
||||
|
||||
|
||||
extern cudaError_t cudaDeviceGetLimit(size_t* pValue, enum cudaLimit limit);
|
||||
|
||||
|
||||
__device__ void d_LinearToCoordEnd(int* ret, size_t lin, int* dim, int rank, size_t size);
|
||||
|
||||
__device__ size_t d_CoordToLinearEnd(int* coo, int* dim, int rank);
|
||||
|
||||
__device__ size_t d_CoordToLinear(int* coo, int* dim, int rank);
|
||||
|
||||
|
||||
__device__ void d_LinearToCoord(int* ret, size_t lin, int* dim, int rank, size_t size);
|
||||
|
||||
__device__ void d_subArray(int* dst, int* src, int debDst, int finDst, int debSrc);
|
||||
|
||||
__device__ void d_minReverse(int* dim, int& rank, const int* dim0, int rank0, const int* dim1, int rank1, bool& rev);
|
||||
|
||||
__device__ void d_reverseArray(int* arr, int sz);
|
||||
|
||||
__device__ int d_min(int a, int b);
|
||||
|
||||
__device__ void d_concatArray(int* dst, int* src0, int* src1, int debDst, int debSrc0, int finSrc0, int debSrc1, int finSrc1);
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__ void d_prodTensor(T* C, int* dimC, int rankC, size_t size, T* A, int* dimA, int rankA, size_t sizeA, T* B, int* dimB, int rankB);
|
||||
|
||||
template<typename T>
|
||||
__global__ void d_prodTensorEnd(T* C, int* dimC, int rankC, size_t size, T* A, int* dimA, int rankA, size_t sizeA, T* B, int* dimB, int rankB);
|
||||
|
||||
template<typename T>
|
||||
__global__ void d_TensorContractnReverseProd(T* C, int* dimC, int rankC, size_t size, T* A, int rankA, size_t sizeA, T* B, int rankB, size_t sizeB, int* dM, int dMrank, size_t dMsize);
|
||||
|
||||
template<typename T>
|
||||
__global__ void d_PermLinearTransformCoord(T* C, int* dimC, int rankC, size_t sizeC, T* A, int* dimA, int rankA, size_t sizeA, int* invPerm);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,574 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
//#include "/home/fanasina/progr_/ptens0neD/tensor/tens0neD/tens0neD.h"
|
||||
|
||||
//#include "/home/fanasina/progr_/ptens0neD/tensor/tensCuda/tensCuda.h"
|
||||
#include "tensor/tensCuda/tensCuda.h"
|
||||
|
||||
|
||||
|
||||
|
||||
/* Runs `call`, and on any result other than cudaSuccess prints the failing
 * expression with the CUDA error code/string and terminates the process
 * with that code. Local to cudaTensorProd; undefined again right after it. */
#define TENSCUDA_PROD_CHECK(call)                                              \
    do {                                                                       \
        cudaError_t errCu_ = (call);                                           \
        if (cudaSuccess != errCu_) {                                           \
            printf("device fnc failed %s \n ErrorCuda: %d : %s\n", #call,      \
                   errCu_, cudaGetErrorString(errCu_));                        \
            exit(errCu_);                                                      \
        }                                                                      \
    } while (0)

/* Host wrapper: computes M from M0 and M1 on the GPU with the d_prodTensor
 * kernel.
 *
 * Steps: size M via add()/initTensor(), upload the three shape arrays and
 * the two operand buffers, launch one thread per element of M, download
 * the result, and release every device buffer.
 * NOTE(review): add(M.Dim, M0.Dim, M1.Dim) presumably sets M's shape to the
 * concatenation of the operand shapes - confirm against its definition.
 *
 * Any CUDA failure prints a diagnostic and calls exit() with the error code.
 *
 * Changes:
 *   - the kernel launch is now followed by a cudaGetLastError() check, so
 *     an invalid launch configuration no longer goes unnoticed;
 *   - the launch is skipped when M has no elements (a zero-block grid is
 *     an invalid configuration);
 *   - the repeated check/print/exit boilerplate is expressed once through
 *     TENSCUDA_PROD_CHECK.
 */
template<typename T>
void cudaTensorProd(Tensor<T>& M, const Tensor<T>& M0, const Tensor<T>& M1) {
    add(M.Dim, M0.Dim, M1.Dim);
    M.initTensor();

    // Device copies of the shape (extent) arrays.
    int* d_imM, * d_imM0, * d_imM1;
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int)));
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int)));
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&d_imM1, M1.Dim.rank * sizeof(int)));

    TENSCUDA_PROD_CHECK(cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));
    TENSCUDA_PROD_CHECK(cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));
    TENSCUDA_PROD_CHECK(cudaMemcpy(d_imM1, M1.Dim.dim, M1.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));

    // Device copies of the element buffers (e: result, e0/e1: operands).
    T* e, * e0, * e1;
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&e, M.Dim.size * sizeof(T)));
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T)));
    TENSCUDA_PROD_CHECK(cudaMalloc((void**)&e1, M1.Dim.size * sizeof(T)));

    TENSCUDA_PROD_CHECK(cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice));
    TENSCUDA_PROD_CHECK(cudaMemcpy(e1, M1.elements, M1.Dim.size * sizeof(T), cudaMemcpyHostToDevice));

    // One thread per output element, rounded up to whole blocks.
    const int BLOCKSIZE = 256;//1024;
    const int DIMBLOCKS = (M.Dim.size + BLOCKSIZE - 1) / BLOCKSIZE;

    if (DIMBLOCKS > 0) {
        d_prodTensor<T> <<<DIMBLOCKS, BLOCKSIZE>>> (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, M0.Dim.size, e1, d_imM1, M1.Dim.rank);
        // Launch-configuration errors are only reported through here.
        TENSCUDA_PROD_CHECK(cudaGetLastError());
    }

    // Blocking copy: also waits for the kernel to finish.
    TENSCUDA_PROD_CHECK(cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost));

    TENSCUDA_PROD_CHECK(cudaFree(e));
    TENSCUDA_PROD_CHECK(cudaFree(e0));
    TENSCUDA_PROD_CHECK(cudaFree(e1));
    TENSCUDA_PROD_CHECK(cudaFree(d_imM));
    TENSCUDA_PROD_CHECK(cudaFree(d_imM0));
    TENSCUDA_PROD_CHECK(cudaFree(d_imM1));
}

#undef TENSCUDA_PROD_CHECK

//template void cudaTensorProd<double>(Tensor<double>& M, const Tensor<double>& M1, const Tensor<double>& M0);
template void cudaTensorProd<float>(Tensor<float>& M, const Tensor<float>& M1, const Tensor<float>& M0);
|
||||
|
||||
|
||||
/* Runs `call`, and on any result other than cudaSuccess prints the failing
 * expression with the CUDA error code/string and terminates the process
 * with that code. Local to cudaTensorProdEnd; undefined again right after
 * it. */
#define TENSCUDA_PRODEND_CHECK(call)                                           \
    do {                                                                       \
        cudaError_t errCu_ = (call);                                           \
        if (cudaSuccess != errCu_) {                                           \
            printf("device fnc failed %s \n ErrorCuda: %d : %s\n", #call,      \
                   errCu_, cudaGetErrorString(errCu_));                        \
            exit(errCu_);                                                      \
        }                                                                      \
    } while (0)

/* Host wrapper: same flow as cudaTensorProd but launches the
 * d_prodTensorEnd kernel with 1024 threads per block.
 *
 * Steps: size M via add()/initTensor(), upload the three shape arrays and
 * the two operand buffers, launch one thread per element of M, wait for
 * the kernel, download the result, and release every device buffer.
 * NOTE(review): add(M.Dim, M0.Dim, M1.Dim) presumably sets M's shape to the
 * concatenation of the operand shapes - confirm against its definition.
 *
 * Any CUDA failure prints a diagnostic and calls exit() with the error code.
 *
 * Changes:
 *   - the kernel launch is now followed by a cudaGetLastError() check;
 *   - the result of cudaDeviceSynchronize() is checked instead of being
 *     discarded, so faults raised while the kernel runs are reported at
 *     the point they occur;
 *   - the launch is skipped when M has no elements (a zero-block grid is
 *     an invalid configuration);
 *   - the repeated check/print/exit boilerplate is expressed once through
 *     TENSCUDA_PRODEND_CHECK.
 */
template<typename T>
void cudaTensorProdEnd(Tensor<T>& M, const Tensor<T>& M0, const Tensor<T>& M1) {
    add(M.Dim, M0.Dim, M1.Dim);
    M.initTensor();

    // Device copies of the shape (extent) arrays.
    int* d_imM, * d_imM0, * d_imM1;
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int)));
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int)));
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&d_imM1, M1.Dim.rank * sizeof(int)));

    TENSCUDA_PRODEND_CHECK(cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));
    TENSCUDA_PRODEND_CHECK(cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));
    TENSCUDA_PRODEND_CHECK(cudaMemcpy(d_imM1, M1.Dim.dim, M1.Dim.rank * sizeof(int), cudaMemcpyHostToDevice));

    // Device copies of the element buffers (e: result, e0/e1: operands).
    T* e, * e0, * e1;
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&e, M.Dim.size * sizeof(T)));
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T)));
    TENSCUDA_PRODEND_CHECK(cudaMalloc((void**)&e1, M1.Dim.size * sizeof(T)));

    TENSCUDA_PRODEND_CHECK(cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice));
    TENSCUDA_PRODEND_CHECK(cudaMemcpy(e1, M1.elements, M1.Dim.size * sizeof(T), cudaMemcpyHostToDevice));

    // One thread per output element, rounded up to whole blocks.
    const size_t BLOCKSIZE = 1024;
    const size_t DIMBLOCKS = (M.Dim.size + BLOCKSIZE - 1) / BLOCKSIZE;

    if (DIMBLOCKS > 0) {
        d_prodTensorEnd<T> <<<DIMBLOCKS, BLOCKSIZE>>> (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, M0.Dim.size, e1, d_imM1, M1.Dim.rank);
        // Launch-configuration errors are only reported through here.
        TENSCUDA_PRODEND_CHECK(cudaGetLastError());
    }

    TENSCUDA_PRODEND_CHECK(cudaDeviceSynchronize());

    TENSCUDA_PRODEND_CHECK(cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost));

    TENSCUDA_PRODEND_CHECK(cudaFree(e));
    TENSCUDA_PRODEND_CHECK(cudaFree(e0));
    TENSCUDA_PRODEND_CHECK(cudaFree(e1));
    TENSCUDA_PRODEND_CHECK(cudaFree(d_imM));
    TENSCUDA_PRODEND_CHECK(cudaFree(d_imM0));
    TENSCUDA_PRODEND_CHECK(cudaFree(d_imM1));
}

#undef TENSCUDA_PRODEND_CHECK

//template void cudaTensorProd<double>(Tensor<double>& M, const Tensor<double>& M1, const Tensor<double>& M0);
template void cudaTensorProdEnd<float>(Tensor<float>& M, const Tensor<float>& M1, const Tensor<float>& M0);
|
||||
|
||||
|
||||
template<typename T>
|
||||
void cudapermuteTensor(Tensor<T>& M, const Tensor<T>& M0, permutation p) {
|
||||
if (p.size == M0.Dim.rank) {
|
||||
M.Dim.rank = M0.Dim.rank;
|
||||
M.Dim.size = M0.Dim.size;
|
||||
M.Dim.initDim();
|
||||
M.initTensor();
|
||||
|
||||
p.permute(M.Dim.dim, M0.Dim.dim);
|
||||
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
cudaEventRecord(start);
|
||||
|
||||
|
||||
int* d_imM, * d_imM0;
|
||||
cudaError_t errCu = cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int));
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int)) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int));
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int)) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
|
||||
T* e, * e0;
|
||||
errCu = cudaMalloc((void**)&e, M.Dim.size * sizeof(T));
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMalloc((void**)&e, M.Dim.size * sizeof(T)) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
errCu = cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T));
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T)) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
|
||||
errCu = cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
|
||||
size_t BLOCKSIZE = 256; //1024;//512;
|
||||
size_t DIMBLOCKS = (M.Dim.size + BLOCKSIZE - 1) / BLOCKSIZE;
|
||||
dim3 blckSZ, gridSZ;
|
||||
blckSZ.x = BLOCKSIZE;
|
||||
gridSZ.x = DIMBLOCKS;
|
||||
|
||||
int* invP, * d_invP;
|
||||
invP = (int*)malloc(M.Dim.rank * sizeof(int));
|
||||
inverseArray(invP, p.perm, M.Dim.rank);
|
||||
errCu = cudaMalloc((void**)&d_invP, M.Dim.rank * sizeof(int));
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMalloc((void**)&d_invP, M.Dim.rank * sizeof(int)) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaMemcpy(d_invP, invP, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMemcpy(d_invP, invP, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
//printf("size: %ld\n", M.Dim.size);
|
||||
|
||||
//d_prodTensorEnd<T> << < DIMBLOCKS, BLOCKSIZE >> > (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank);
|
||||
//d_TensorContractnReverseProd<T> << < DIMBLOCKS, BLOCKSIZE >> > (e, d_imM, M.Dim.rank, M.Dim.size, d_imdM, dM.rank, dM.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank, nestingDepth);
|
||||
//d_TensorContractnReverseProd<T> << < gridSZ, blckSZ, 0, 0 >> > (e, d_imM, M.Dim.rank, M.Dim.size, d_imdM, dM.rank, dM.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank, nestingDepth);
|
||||
d_PermLinearTransformCoord<T> << < gridSZ, blckSZ, 0, 0 >> > (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, M0.Dim.size, d_invP);
|
||||
//d_PermLinearTransformCoord<T> << < gridSZ, blckSZ, 0, 0 >> > (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, M0.Dim.size, p.perm);
|
||||
//cudaDeviceSynchronize();
|
||||
|
||||
|
||||
errCu = cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaFree(e);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaFree(e) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
errCu = cudaFree(e0);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaFree(e0) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
errCu = cudaFree(d_imM);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaFree(d_imM) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
errCu = cudaFree(d_imM0);
|
||||
if (cudaSuccess != errCu) {
|
||||
printf("device fnc failed cudaFree(d_imM0) \n ErrorCuda: %d : %s\n", errCu, cudaGetErrorString(errCu));
|
||||
exit(errCu);
|
||||
}
|
||||
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
float milliseconds = 0;
|
||||
cudaEventElapsedTime(&milliseconds, start, stop);
|
||||
printf("ellaps time cuda permute tensor: %f ms\n", milliseconds);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit instantiation: emit the float version of cudapermuteTensor in this
// translation unit so callers that only see the declaration can link to it.
template void cudapermuteTensor<float>(
    Tensor<float>& M,
    const Tensor<float>& M0,
    permutation p);
|
||||
|
||||
|
||||
// Reports a failed CUDA runtime call in this file's usual
// "device fnc failed <call> / ErrorCuda: <code> : <text>" format and terminates
// the process with the CUDA error code. Does nothing when errCu is cudaSuccess.
static void contractNestProdCheck(cudaError_t errCu, const char* what) {
    if (cudaSuccess != errCu) {
        printf("device fnc failed %s \n ErrorCuda: %d : %s\n", what, errCu, cudaGetErrorString(errCu));
        exit(errCu);
    }
}

// GPU contraction ("nested product") of M0 with M11 over nestingDepth dimensions.
//
// Original author's note: strict match contract ! if no strict, we take the minimum
// NOTE(review): `strict` is accepted for interface compatibility but is not read
// anywhere in this body.
//
// Steps performed here:
//   1. scanPermuteMatchContractTensorfromSrcToDst() looks for a permutation of
//      M11's axes that matches M0 for the given nestingDepth; M11 is permuted
//      into the local tensor M1 with it.
//   2. M.Dim is built with add() from two sub-extents: len0 = M0.Dim.rank -
//      nestingDepth entries taken from M0.Dim.dim and len1 = M1.Dim.rank -
//      nestingDepth entries taken from M1.Dim.dim; M.initTensor() is then called.
//   3. Extents and elements are copied to the device, the kernel
//      d_TensorContractnReverseProd<T> is launched on a 1-D grid of 256-thread
//      blocks (one thread per element of M, rounded up) and the result is
//      copied back into M.elements.
//   4. The elapsed time measured with CUDA events is printed.
//
// Parameters:
//   M            - output tensor; its Dim and elements are overwritten.
//   M0           - left operand (host tensor).
//   M11          - right operand (host tensor); it is permuted into a local copy.
//   nestingDepth - number of dimensions that are contracted.
//   strict       - currently unused (see note above).
//
// Throws std::invalid_argument when no matching permutation is found.
// Any failing CUDA runtime call prints a diagnostic and exit()s the process.
template<typename T>
void cudaTensorContractNestProd(Tensor<T>& M, const Tensor<T>& M0, const Tensor<T>& M11, int nestingDepth, bool strict) {

    // Heap buffer instead of a variable-length array (VLAs are not standard C++).
    int* perm = new int[M11.Dim.rank];
    struct Tensor<T> M1;

    if (!scanPermuteMatchContractTensorfromSrcToDst(perm, M11, M0, nestingDepth)) {
        delete[] perm;
        printf("Failed in Deep = %d\n", nestingDepth);
        //throw std::check_ProdTensor(" Failed imbrication order in Multiplication matrix ");
        throw std::invalid_argument(" Failed imbrication order in Multiplication matrix ");
    }

    for (int i = 0; i < M11.Dim.rank; i++) {
        printf(" %d[%d] ", i, perm[i]);
    }
    printf(": last perm \n");

    {
        // p lives only for the duration of the permutation, exactly as before;
        // perm is released once p is gone.
        struct permutation p(M11.Dim.rank, perm);
        permuteTensor(M1, M11, p);
        M1.Dim.print();
    }
    delete[] perm;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    // Number of non-contracted dimensions on each side.
    int len0 = M0.Dim.rank - nestingDepth;
    int len1 = M1.Dim.rank - nestingDepth;

    // NOTE(review): these four buffers are never delete[]d. Whether `dimension`
    // copies them or takes ownership is not visible from here, so they are left
    // untouched; if the constructor copies, they leak on every call - confirm.
    int* tsub0 = new int[len0];
    int* tsub1 = new int[len1];
    int* tDk1 = new int[nestingDepth];
    int* tDk0 = new int[nestingDepth];

    // presumably subArray(dst, src, 0, count, srcOffset) - verify against its definition
    subArray(tsub0, M0.Dim.dim, 0, len0, 0);
    subArray(tsub1, M1.Dim.dim, 0, len1, nestingDepth);
    subArray(tDk1, M1.Dim.dim, 0, nestingDepth, 0);
    subArray(tDk0, M0.Dim.dim, 0, nestingDepth, len0);

    dimension dSub0(len0, tsub0);
    dimension dSub1(len1, tsub1);
    dimension dM1(nestingDepth, tDk1);
    dimension dM0(nestingDepth, tDk0);
    dimension dM(dM0);
    //bool rev;
    //minReverse(dM, dM0, dM1, rev);
    //if (rev) reverseArray(dM.dim, dM.rank);
    //max(dM, dM0, dM1);

    add(M.Dim, dSub0, dSub1);
    M.initTensor();

    // ---- device copies of the extent arrays --------------------------------
    int* d_imM = 0;
    int* d_imM0 = 0;
    int* d_imM1 = 0;
    int* d_imdM = 0;

    contractNestProdCheck(cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int)),
        "cudaMalloc((void**)&d_imM, M.Dim.rank * sizeof(int))");
    contractNestProdCheck(cudaMalloc((void**)&d_imdM, dM.rank * sizeof(int)),
        "cudaMalloc((void**)&d_imdM, dM.rank * sizeof(int))");
    contractNestProdCheck(cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int)),
        "cudaMalloc((void**)&d_imM0, M0.Dim.rank * sizeof(int))");
    contractNestProdCheck(cudaMalloc((void**)&d_imM1, M1.Dim.rank * sizeof(int)),
        "cudaMalloc((void**)&d_imM1, M1.Dim.rank * sizeof(int))");

    contractNestProdCheck(cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_imM, M.Dim.dim, M.Dim.rank * sizeof(int), cudaMemcpyHostToDevice)");
    contractNestProdCheck(cudaMemcpy(d_imdM, dM.dim, dM.rank * sizeof(int), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_imdM, dM.dim, dM.rank * sizeof(int), cudaMemcpyHostToDevice)");
    contractNestProdCheck(cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_imM0, M0.Dim.dim, M0.Dim.rank * sizeof(int), cudaMemcpyHostToDevice)");
    contractNestProdCheck(cudaMemcpy(d_imM1, M1.Dim.dim, M1.Dim.rank * sizeof(int), cudaMemcpyHostToDevice),
        "cudaMemcpy(d_imM1, M1.Dim.dim, M1.Dim.rank * sizeof(int), cudaMemcpyHostToDevice)");

    // ---- device copies of the element arrays -------------------------------
    T* e = 0;
    T* e0 = 0;
    T* e1 = 0;

    contractNestProdCheck(cudaMalloc((void**)&e, M.Dim.size * sizeof(T)),
        "cudaMalloc((void**)&e, M.Dim.size * sizeof(T))");
    contractNestProdCheck(cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T)),
        "cudaMalloc((void**)&e0, M0.Dim.size * sizeof(T))");
    contractNestProdCheck(cudaMalloc((void**)&e1, M1.Dim.size * sizeof(T)),
        "cudaMalloc((void**)&e1, M1.Dim.size * sizeof(T))");

    contractNestProdCheck(cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice),
        "cudaMemcpy(e0, M0.elements, M0.Dim.size * sizeof(T), cudaMemcpyHostToDevice)");
    contractNestProdCheck(cudaMemcpy(e1, M1.elements, M1.Dim.size * sizeof(T), cudaMemcpyHostToDevice),
        "cudaMemcpy(e1, M1.elements, M1.Dim.size * sizeof(T), cudaMemcpyHostToDevice)");

    // ---- launch: one thread per output element, rounded up to whole blocks --
    size_t BLOCKSIZE = 256; //1024;//512;
    size_t DIMBLOCKS = (M.Dim.size + BLOCKSIZE - 1) / BLOCKSIZE;
    dim3 blckSZ, gridSZ;
    blckSZ.x = BLOCKSIZE;
    gridSZ.x = DIMBLOCKS;

    //d_prodTensorEnd<T> << < DIMBLOCKS, BLOCKSIZE >> > (e, d_imM, M.Dim.rank, M.Dim.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank);
    //d_TensorContractnReverseProd<T> << < DIMBLOCKS, BLOCKSIZE >> > (e, d_imM, M.Dim.rank, M.Dim.size, d_imdM, dM.rank, dM.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank, nestingDepth);
    //d_TensorContractnReverseProd<T> << < gridSZ, blckSZ, 0, 0 >> > (e, d_imM, M.Dim.rank, M.Dim.size, d_imdM, dM.rank, dM.size, e0, d_imM0, M0.Dim.rank, e1, d_imM1, M1.Dim.rank, nestingDepth);

    // A grid with zero blocks is an invalid launch configuration, so an empty
    // result tensor skips the kernel instead of provoking a launch error.
    if (DIMBLOCKS > 0) {
        d_TensorContractnReverseProd<T> << < gridSZ, blckSZ, 0, 0 >> > (e, d_imM, M.Dim.rank, M.Dim.size, e0, M0.Dim.rank, M0.Dim.size, e1, M1.Dim.rank, M1.Dim.size, d_imdM, dM.rank, dM.size);
        // Kernel launches return nothing; a bad configuration is only visible here.
        contractNestProdCheck(cudaGetLastError(),
            "d_TensorContractnReverseProd<T> kernel launch");
    }

    //cudaDeviceSynchronize();

    contractNestProdCheck(cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost),
        "cudaMemcpy(M.elements, e, M.Dim.size * sizeof(T), cudaMemcpyDeviceToHost)");

    // ---- release device memory ---------------------------------------------
    contractNestProdCheck(cudaFree(e), "cudaFree(e)");
    contractNestProdCheck(cudaFree(e0), "cudaFree(e0)");
    contractNestProdCheck(cudaFree(e1), "cudaFree(e1)");
    contractNestProdCheck(cudaFree(d_imM), "cudaFree(d_imM)");
    contractNestProdCheck(cudaFree(d_imM0), "cudaFree(d_imM0)");
    contractNestProdCheck(cudaFree(d_imM1), "cudaFree(d_imM1)");
    // d_imdM was allocated above but never released before.
    contractNestProdCheck(cudaFree(d_imdM), "cudaFree(d_imdM)");

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("ellaps time cuda prod contract prod: %f ms\n", milliseconds);

    // The events are no longer needed once the elapsed time has been read.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
|
||||
|
||||
// Explicit instantiation: emit the float version of cudaTensorContractNestProd
// in this translation unit so callers that only see the declaration can link.
template void cudaTensorContractNestProd<float>(
    Tensor<float>& M,
    const Tensor<float>& M0,
    const Tensor<float>& M1,
    int nestingDepth,
    bool strict);
|
||||
//template void cudaTensorContractnReverseProd<double>(Tensor<double>& M, const Tensor<double>& M0, const Tensor<double>& M1, int nestingDepth);
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
#ifndef __TENS_CUDA_H__
|
||||
#define __TENS_CUDA_H__
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
//#include "/home/fanasina/progr_/ptens0neD/tensor/tens0neD/tens0neD.h"
|
||||
#include "tensor/tens0neD/tens0neD.h"
|
||||
|
||||
//#include "/home/fanasina/progr_/ptens0neD/tensor/tensCuda/d_tensCuda.h"
|
||||
#include "tensor/tensCuda/d_tensCuda.h"
|
||||
//#include "dimension/dimension.h"
|
||||
|
||||
// Forward declaration of the tensor class template used by the prototypes below.
// presumably the full definition comes from tensor/tens0neD/tens0neD.h - confirm
template <class T> struct Tensor;
|
||||
|
||||
// GPU contraction of M0 with M1 over nestingDepth dimensions; the result
// (extents and elements) is written into M.
// `strict` defaults to true.
template <class T>
void cudaTensorContractNestProd(
    Tensor<T>& M,
    const Tensor<T>& M0,
    const Tensor<T>& M1,
    int nestingDepth,
    bool strict = true);
|
||||
|
||||
// NOTE(review): implementation not visible from this header; presumably the GPU
// tensor product of M0 and M1 with the result written to M - confirm.
template <class T>
void cudaTensorProd(
    Tensor<T>& M,
    const Tensor<T>& M0,
    const Tensor<T>& M1);
|
||||
// NOTE(review): implementation not visible from this header; presumably a
// variant of cudaTensorProd (M0 combined with M1, result written to M) - confirm
// how it differs.
template <class T>
void cudaTensorProdEnd(
    Tensor<T>& M,
    const Tensor<T>& M0,
    const Tensor<T>& M1);
|
||||
// GPU permutation of tensor M0 according to p; the permuted elements are copied
// back into M.elements. The permutation object is taken by value.
template <class T>
void cudapermuteTensor(
    Tensor<T>& M,
    const Tensor<T>& M0,
    permutation p);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user