Actual source code: ex2.c

petsc-3.13.4 2020-08-01
static char help[] = "Test SF CUDA stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, it performs
  these operations in the default CUDA stream and does not synchronize them, since it assumes the
  routines consuming the destination data also run on the default stream. However, when the
  destination data is on the CPU, SF must guarantee the data is ready to use on the CPU after
  PetscSFXxxEnd().
 */
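/*
  For orientation only, a minimal sketch (not PETSc's actual implementation) of the raw CUDA
  pattern the note above describes: an asynchronous device-to-host copy issued on the default
  (NULL) stream must be followed by a stream synchronization before the host may safely read
  the destination buffer. host_dst, dev_src and count are hypothetical names.

    cudaMemcpyAsync(host_dst,dev_src,count,cudaMemcpyDeviceToHost,NULL); // returns before the copy finishes
    cudaStreamSynchronize(NULL); // without this, host_dst may still hold stale data

  When the consumer is another kernel or copy on the same stream, the stream ordering alone
  suffices and no explicit synchronization is needed.
 */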

#include <petscvec.h>
int main(int argc,char **argv)
{
  PetscErrorCode     ierr;
  PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar        *val;
  const PetscScalar  *yval;
  Vec                x,y;
  PetscMPIInt        size;
  IS                 ix,iy;
  VecScatter         vscat;

  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
  if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on the host, we make y a
     CUDA vector too, since we want y's host memory to be pinned (i.e., non-pageable), to really
     trigger an asynchronous cudaMemcpyDeviceToHost.
   */
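  /* Background: CUDA performs a truly asynchronous cudaMemcpyDeviceToHost only into page-locked
     (pinned) host memory; with ordinary pageable memory the transfer is staged and effectively
     synchronous with respect to the host, which would mask the bug this test targets. */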
  ierr = VecCreateSeq(PETSC_COMM_SELF,n,&x);CHKERRQ(ierr);
  ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);
  ierr = VecCreateSeq(PETSC_COMM_SELF,n,&y);CHKERRQ(ierr);
  ierr = VecSetType(y,VECSEQCUDA);CHKERRQ(ierr);
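  /* Equivalently, in a CUDA-enabled PETSc build, VecCreateSeqCUDA(PETSC_COMM_SELF,n,&x) creates a
     vector of type VECSEQCUDA in one call. */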

  /* Initialize x, y, and push them to the GPU (their offloadmask becomes PETSC_OFFLOAD_GPU) */
  ierr = VecGetArray(x,&val);CHKERRQ(ierr);
  for (i=0; i<n; i++) val[i] = i/2.0;
  ierr = VecRestoreArray(x,&val);CHKERRQ(ierr);
  ierr = VecScale(x,2.0);CHKERRQ(ierr);
  ierr = VecSet(y,314);CHKERRQ(ierr);
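  /* For VECSEQCUDA, VecScale and VecSet execute on the GPU, so after these calls the up-to-date
     data of x and y lives in device memory; note the scaling makes x[i] = (i/2.0)*2 = i exactly. */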

  /* Pull y back to the CPU (making its offloadmask PETSC_OFFLOAD_CPU); the array itself is not used */
  ierr = VecGetArray(y,&val);CHKERRQ(ierr);
  ierr = VecRestoreArray(y,&val);CHKERRQ(ierr);

  /* The vscat is simply a vector copy: both index sets are the identity map 0..n-1 */
  ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);CHKERRQ(ierr);
  ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);CHKERRQ(ierr);
  ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr);

  /* Do the device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host once
     VecScatterEnd() returns. Otherwise, wrong data will be displayed.
   */
  ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr);
  /* Display the first and the last entries of y to check whether it is valid on the host */
  ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr);
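  /* If the device-to-host transfer was properly synchronized, this prints y[0]=0 and
     y[99999] = 99999 (since x[i] = i); unsynchronized, y could still show its initial value 314. */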

  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = VecDestroy(&y);CHKERRQ(ierr);
  ierr = ISDestroy(&ix);CHKERRQ(ierr);
  ierr = ISDestroy(&iy);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}

/*TEST

   test:
     requires: cuda
     # make sure the host memory is pinned
     args: -vec_pinned_memory_min 0

TEST*/