/* Actual source code: ex2.c (petsc-3.13.4, 2020-08-01) */
1: static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
2: /*
3: SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
4: operations in the default stream and does not sync these operations since it assumes routines consume
5: the destination data are also on the default stream. However, when destination data in on CPU,
6: SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd().
7: */
9: #include <petscvec.h>
10: int main(int argc,char **argv)
11: {
12: PetscErrorCode ierr;
13: PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */
14: PetscScalar *val;
15: const PetscScalar *yval;
16: Vec x,y;
17: PetscMPIInt size;
18: IS ix,iy;
19: VecScatter vscat;
22: PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
23: MPI_Comm_size(PETSC_COMM_WORLD,&size);
24: if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");
26: /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
27: since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
28: cudaMemcpyDeviceToHost.
29: */
30: VecCreateSeq(PETSC_COMM_SELF,n,&x);
31: VecSetType(x,VECSEQCUDA);
32: VecCreateSeq(PETSC_COMM_SELF,n,&y);
33: VecSetType(y,VECSEQCUDA);
35: /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
36: VecGetArray(x,&val);
37: for (i=0; i<n; i++) val[i] = i/2.0;
38: VecRestoreArray(x,&val);
39: VecScale(x,2.0);
40: VecSet(y,314);
42: /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
43: VecGetArray(y,&val);
44: VecRestoreArray(y,&val);
46: /* The vscat is simply a vector copy */
47: ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);
48: ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);
49: VecScatterCreate(x,ix,y,iy,&vscat);
51: /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
52: cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
53: */
54: VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
55: VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
56: VecGetArrayRead(y,&yval);
57: /* Display the first and the last entries of y to see if it is valid on host */
58: PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));
59: VecRestoreArrayRead(y,&yval);
61: VecDestroy(&x);
62: VecDestroy(&y);
63: ISDestroy(&ix);
64: ISDestroy(&iy);
65: VecScatterDestroy(&vscat);
66: PetscFinalize();
67: return ierr;
68: }
/*TEST

   test:
     requires: cuda
     # make sure the host memory is pinned
     args: -vec_pinned_memory_min 0

TEST*/