Actual source code: bvorthogcuda.cu

slepc-3.12.2 2020-01-13
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>          /*I   "slepcbv.h"   I*/
#include <slepcblaslapack.h>
#include <petsccublas.h>
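
/*
   Data layout used by the routines in this file: bv->buffer stores the
   orthogonalization coefficients as consecutive columns of length
   bv->nc+bv->m (the bv->nc constraint coefficients come first), so column j
   starts at offset j*(bv->nc+bv->m). When the caller passes h=NULL the
   coefficients live on the GPU inside bv->buffer; otherwise h points to
   host memory and the plain CPU branch is taken.
*/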

/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscErrorCode ierr;
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_a);CHKERRQ(ierr);
    d_hh = d_a + j*(bv->nc+bv->m);
    cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_a);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(0);
}

/*
   BV_AddCoefficients_CUDA - Adds the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
 */
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscBLASInt   idx,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(bv->buffer,&d_c);CHKERRQ(ierr);
    d_h = d_c + j*(bv->nc+bv->m);
    ierr = PetscBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*bv->nc+j);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_c);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    ierr = PetscLogFlops(1.0*bv->nc+j);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*a;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&a);CHKERRQ(ierr);
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&a);CHKERRQ(ierr);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(0);
}

/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
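/* When h is NULL, the sum is taken over the first bv->nc+j entries of bv->buffer,
   which appears to be the scratch (0-th) column mentioned in BV_AddCoefficients_CUDA */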
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscBLASInt      idx,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(bv->nc+j,&idx);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(2.0*bv->nc+j);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    *sum = PetscRealPart(dot);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    ierr = PetscLogFlops(2.0*bv->nc+j);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#define X_AXIS        0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X  16 /* work to be done by any thread on axis x */
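
/*
   Grid-sizing scheme: each thread processes TILE_SIZE_X consecutive entries, so a
   block of BLOCK_SIZE_X threads covers BLOCK_SIZE_X*TILE_SIZE_X entries. If the
   resulting grid would exceed the device limit maxGridSize[X_AXIS], SetGrid1D caps
   the grid and returns in xcount the number of kernel launches needed to cover n.
*/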

/*
   Set the kernels grid dimensions
   xcount: number of kernel calls needed for the requested size
 */
PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1;
  PetscBLASInt          card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  PetscFunctionBegin;
  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/BLOCK_SIZE_X*TILE_SIZE_X;
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x = one;
  }
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  PetscFunctionReturn(0);
}

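/*
   The kernels below assign TILE_SIZE_X consecutive entries to each thread; the
   i<n test guards the final, possibly partial, tile. The xcount argument is the
   index of the current launch when SetGrid1D determined that more than one
   launch is needed to cover all n entries.
*/
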
/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}

/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}

/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
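/* The signature is read from bv->omega; its entries are treated as real values
   (via PetscRealPart) in both the GPU and CPU branches */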
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscErrorCode    ierr;
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d, threads3d;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(0);
  if (!h) {
    ierr = VecCUDAGetArray(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = PetscLogGpuFlops(1.0*bv->nc+j);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    ierr = VecRestoreArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    ierr = PetscLogFlops(1.0*bv->nc+j);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_SquareRoot_CUDA - Returns the square root of the entry at position j (counted after
   the constraints) of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       hh;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = BV_SafeSqrt(bv,hh,beta);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = BV_SafeSqrt(bv,h[bv->nc+j],beta);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   BV_StoreCoefficients_CUDA - Copies the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h,*d_a;
  PetscInt          i;
  cudaError_t       cerr;

  PetscFunctionBegin;
  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    cerr = cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(0);
}