Actual source code: sfpack.c
1: #include "petsc/private/sfimpl.h"
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
7: /*
8: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9: * therefore we pack data types manually. This file defines packing routines for the standard data types.
10: */
12: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
14: /* Operations working like s += t */
15: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
16: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
17: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
18: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
19: /* Modeled on MPI_MAXLOC/MPI_MINLOC */
20: #define OP_XLOC(op,s,t) \
21: do { \
22: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23: else if (!((s).u op (t).u)) s = t; \
24: } while (0)
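/* For illustration, with op '>' and the PairType(int,int) struct defined near the end of this file,
   OP_XLOC reproduces MPI_MAXLOC semantics (keep the larger value; on ties, keep the smaller index):

     typedef struct {int u; int i;} int_int;
     int_int s = {5,3}, t = {5,1};
     OP_XLOC(>,s,t);   // .u values tie, so s.i = PetscMin(3,1); s is now {5,1}
     t.u = 7;
     OP_XLOC(>,s,t);   // !(5 > 7), so s = t; s is now {7,1}
*/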
26: /* DEF_PackFunc - macro defining a Pack routine
28: Arguments of the macro:
29: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30: .BS Block size for vectorization. It is a factor of bs.
31: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
33: Arguments of the Pack routine:
34: +count Number of indices in idx[].
35: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36: .opt Per-pack optimization plan. NULL means no such plan.
37: .idx Indices of entries to be packed.
38: .link Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
39: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
40: -packed Address of the packed data.
41: */
42: #define DEF_PackFunc(Type,BS,EQ) \
43: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44: { \
46: const Type *u = (const Type*)unpacked,*u2; \
47: Type *p = (Type*)packed,*p2; \
48: PetscInt i,j,k,X,Y,r,bs=link->bs; \
49: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
50: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
52: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);} /* idx is NULL: indices are contiguous */ \
53: else if (opt) { /* has optimizations available */ \
54: p2 = p; \
55: for (r=0; r<opt->n; r++) { \
56: u2 = u + opt->start[r]*MBS; \
57: X = opt->X[r]; \
58: Y = opt->Y[r]; \
59: for (k=0; k<opt->dz[r]; k++) \
60: for (j=0; j<opt->dy[r]; j++) { \
61: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
62: p2 += opt->dx[r]*MBS; \
63: } \
64: } \
65: } else { \
66: for (i=0; i<count; i++) \
67: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
68: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
69: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
70: } \
71: return(0); \
72: }
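/* For example, DEF_PackFunc(PetscInt,2,1) defines Pack_PetscInt_2_1(), in which M=1 and MBS=2 are
   compile-time constants, so the j-loop vanishes and the k-loop becomes a fixed-length copy the
   compiler can unroll or vectorize.

   A sketch of how an opt plan is read: range r covers a dx[r] x dy[r] x dz[r] block of entries
   embedded in an array with row stride X[r] and plane stride X[r]*Y[r], starting at entry start[r].
   E.g., with start=5, X=10, Y=4, dx=3, dy=2, dz=1 and MBS=1, the loops above copy entries
   {5,6,7} and {15,16,17}: row j of plane k begins at entry 5 + (10*4*k + 10*j). */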
74: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
75: and inserts into a sparse array.
77: Arguments:
78: .Type Type of the data
79: .BS Block size for vectorization
80: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
82: Notes:
83: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
84: */
85: #define DEF_UnpackFunc(Type,BS,EQ) \
86: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87: { \
89: Type *u = (Type*)unpacked,*u2; \
90: const Type *p = (const Type*)packed; \
91: PetscInt i,j,k,X,Y,r,bs=link->bs; \
92: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
93: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
95: if (!idx) { \
96: u += start*MBS; \
97: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
98: } else if (opt) { /* has optimizations available */ \
99: for (r=0; r<opt->n; r++) { \
100: u2 = u + opt->start[r]*MBS; \
101: X = opt->X[r]; \
102: Y = opt->Y[r]; \
103: for (k=0; k<opt->dz[r]; k++) \
104: for (j=0; j<opt->dy[r]; j++) { \
105: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
106: p += opt->dx[r]*MBS; \
107: } \
108: } \
109: } else { \
110: for (i=0; i<count; i++) \
111: for (j=0; j<M; j++) \
112: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
113: } \
114: return(0); \
115: }
117: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
119: Arguments:
120: +Opname Name of the Op, such as Add, Mult, LAND, etc.
121: .Type Type of the data
122: .BS Block size for vectorization
123: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
124: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125: .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126: */
127: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129: { \
130: Type *u = (Type*)unpacked,*u2; \
131: const Type *p = (const Type*)packed; \
132: PetscInt i,j,k,X,Y,r,bs=link->bs; \
133: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
134: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
136: if (!idx) { \
137: u += start*MBS; \
138: for (i=0; i<count; i++) \
139: for (j=0; j<M; j++) \
140: for (k=0; k<BS; k++) \
141: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
142: } else if (opt) { /* idx[] has patterns */ \
143: for (r=0; r<opt->n; r++) { \
144: u2 = u + opt->start[r]*MBS; \
145: X = opt->X[r]; \
146: Y = opt->Y[r]; \
147: for (k=0; k<opt->dz[r]; k++) \
148: for (j=0; j<opt->dy[r]; j++) { \
149: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
150: p += opt->dx[r]*MBS; \
151: } \
152: } \
153: } else { \
154: for (i=0; i<count; i++) \
155: for (j=0; j<M; j++) \
156: for (k=0; k<BS; k++) \
157: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
158: } \
159: return(0); \
160: }
162: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164: { \
165: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
166: PetscInt i,j,k,r,l,bs=link->bs; \
167: const PetscInt M = (EQ) ? 1 : bs/BS; \
168: const PetscInt MBS = M*BS; \
170: for (i=0; i<count; i++) { \
171: r = (!idx ? start+i : idx[i])*MBS; \
172: l = i*MBS; \
173: for (j=0; j<M; j++) \
174: for (k=0; k<BS; k++) { \
175: tmp = u[r+j*BS+k]; \
176: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
177: p[l+j*BS+k] = tmp; \
178: } \
179: } \
180: return(0); \
181: }
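/* Worked example with Type=int, BS=1, EQ=1, Opname=Add: if u[idx[0]] = 10 and p[0] = 3 on entry,
   then on exit u[idx[0]] = 13 and p[0] = 10, i.e., the packed buffer returns the old root values. */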
183: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185: { \
187: const Type *u = (const Type*)src; \
188: Type *v = (Type*)dst; \
189: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
190: const PetscInt M = (EQ) ? 1 : bs/BS; \
191: const PetscInt MBS = M*BS; \
193: if (!srcIdx) { /* src is contiguous */ \
194: u += srcStart*MBS; \
195: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
196: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
197: u += srcOpt->start[0]*MBS; \
198: v += dstStart*MBS; \
199: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
200: for (k=0; k<srcOpt->dz[0]; k++) \
201: for (j=0; j<srcOpt->dy[0]; j++) { \
202: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
203: v += srcOpt->dx[0]*MBS; \
204: } \
205: } else { /* all other cases */ \
206: for (i=0; i<count; i++) { \
207: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
208: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
209: for (j=0; j<M; j++) \
210: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
211: } \
212: } \
213: return(0); \
214: }
216: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218: { \
219: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
220: const Type *ldata = (const Type*)leafdata; \
221: PetscInt i,j,k,r,l,bs = link->bs; \
222: const PetscInt M = (EQ) ? 1 : bs/BS; \
223: const PetscInt MBS = M*BS; \
225: for (i=0; i<count; i++) { \
226: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
227: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
228: for (j=0; j<M; j++) \
229: for (k=0; k<BS; k++) { \
230: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
231: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
232: } \
233: } \
234: return(0); \
235: }
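/* Same fetch-and-op idea, but both sides are addressed directly: with Op=+, rdata[r] = 10 and
   ldata[l] = 3 on entry give lupdate[l] = 10 (the old root value) and rdata[r] = 13 on exit. */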
237: /* Pack, Unpack/Fetch ops */
238: #define DEF_Pack(Type,BS,EQ) \
239: DEF_PackFunc(Type,BS,EQ) \
240: DEF_UnpackFunc(Type,BS,EQ) \
241: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
242: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
243: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
244: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
245: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
246: }
248: /* Add, Mult ops */
249: #define DEF_Add(Type,BS,EQ) \
250: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
251: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
252: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
253: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
254: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
255: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
256: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
257: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
258: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
259: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
260: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
261: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
262: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
263: }
265: /* Max, Min ops */
266: #define DEF_Cmp(Type,BS,EQ) \
267: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
268: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
269: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
270: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
271: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
272: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
273: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
274: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
275: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
276: }
278: /* Logical ops.
279: The operator passed to OP_LXOR should be empty, but || is put there instead. It is never used; it only
280: avoids the compilation warning "empty macro arguments are undefined in ISO C90".
281: */
282: #define DEF_Log(Type,BS,EQ) \
283: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
284: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
285: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
286: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
287: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
288: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
289: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
290: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
291: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
292: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
293: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
294: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
295: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
296: }
298: /* Bitwise ops */
299: #define DEF_Bit(Type,BS,EQ) \
300: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
301: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
302: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
303: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
304: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
305: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
306: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
307: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
308: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
309: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
310: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
311: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
312: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
313: }
315: /* Maxloc, Minloc ops */
316: #define DEF_Xloc(Type,BS,EQ) \
317: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
318: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
319: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
320: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
321: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
322: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
323: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
324: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
325: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
326: }
328: #define DEF_IntegerType(Type,BS,EQ) \
329: DEF_Pack(Type,BS,EQ) \
330: DEF_Add(Type,BS,EQ) \
331: DEF_Cmp(Type,BS,EQ) \
332: DEF_Log(Type,BS,EQ) \
333: DEF_Bit(Type,BS,EQ) \
334: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
335: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
336: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
337: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
338: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
339: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
340: }
342: #define DEF_RealType(Type,BS,EQ) \
343: DEF_Pack(Type,BS,EQ) \
344: DEF_Add(Type,BS,EQ) \
345: DEF_Cmp(Type,BS,EQ) \
346: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
347: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
348: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
349: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
350: }
352: #if defined(PETSC_HAVE_COMPLEX)
353: #define DEF_ComplexType(Type,BS,EQ) \
354: DEF_Pack(Type,BS,EQ) \
355: DEF_Add(Type,BS,EQ) \
356: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
357: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
358: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
359: }
360: #endif
362: #define DEF_DumbType(Type,BS,EQ) \
363: DEF_Pack(Type,BS,EQ) \
364: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
365: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
366: }
368: /* Maxloc, Minloc */
369: #define DEF_PairType(Type,BS,EQ) \
370: DEF_Pack(Type,BS,EQ) \
371: DEF_Xloc(Type,BS,EQ) \
372: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
373: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
374: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
375: }
377: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
378: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
386: #if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be harmless) to generate redundant functions when PetscInt is int */
387: DEF_IntegerType(int,1,1)
388: DEF_IntegerType(int,2,1)
389: DEF_IntegerType(int,4,1)
390: DEF_IntegerType(int,8,1)
391: DEF_IntegerType(int,1,0)
392: DEF_IntegerType(int,2,0)
393: DEF_IntegerType(int,4,0)
394: DEF_IntegerType(int,8,0)
395: #endif
397: /* The typedefs are used to get type names without spaces, which CPPJoin4 can handle */
398: typedef signed char SignedChar;
399: DEF_IntegerType(SignedChar,1,1)
400: DEF_IntegerType(SignedChar,2,1)
401: DEF_IntegerType(SignedChar,4,1)
402: DEF_IntegerType(SignedChar,8,1)
403: DEF_IntegerType(SignedChar,1,0)
404: DEF_IntegerType(SignedChar,2,0)
405: DEF_IntegerType(SignedChar,4,0)
406: DEF_IntegerType(SignedChar,8,0)
408: typedef unsigned char UnsignedChar;
409: DEF_IntegerType(UnsignedChar,1,1)
410: DEF_IntegerType(UnsignedChar,2,1)
411: DEF_IntegerType(UnsignedChar,4,1)
412: DEF_IntegerType(UnsignedChar,8,1)
413: DEF_IntegerType(UnsignedChar,1,0)
414: DEF_IntegerType(UnsignedChar,2,0)
415: DEF_IntegerType(UnsignedChar,4,0)
416: DEF_IntegerType(UnsignedChar,8,0)
418: DEF_RealType(PetscReal,1,1)
419: DEF_RealType(PetscReal,2,1)
420: DEF_RealType(PetscReal,4,1)
421: DEF_RealType(PetscReal,8,1)
422: DEF_RealType(PetscReal,1,0)
423: DEF_RealType(PetscReal,2,0)
424: DEF_RealType(PetscReal,4,0)
425: DEF_RealType(PetscReal,8,0)
427: #if defined(PETSC_HAVE_COMPLEX)
428: DEF_ComplexType(PetscComplex,1,1)
429: DEF_ComplexType(PetscComplex,2,1)
430: DEF_ComplexType(PetscComplex,4,1)
431: DEF_ComplexType(PetscComplex,8,1)
432: DEF_ComplexType(PetscComplex,1,0)
433: DEF_ComplexType(PetscComplex,2,0)
434: DEF_ComplexType(PetscComplex,4,0)
435: DEF_ComplexType(PetscComplex,8,0)
436: #endif
438: #define PairType(Type1,Type2) Type1##_##Type2
439: typedef struct {int u; int i;} PairType(int,int);
440: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441: DEF_PairType(PairType(int,int),1,1)
442: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
444: /* If we don't know the basic type, we treat it as a stream of chars or ints */
445: DEF_DumbType(char,1,1)
446: DEF_DumbType(char,2,1)
447: DEF_DumbType(char,4,1)
448: DEF_DumbType(char,1,0)
449: DEF_DumbType(char,2,0)
450: DEF_DumbType(char,4,0)
452: typedef int DumbInt; /* A name different from the 'int' used above; type names are used to build routine names. */
453: DEF_DumbType(DumbInt,1,1)
454: DEF_DumbType(DumbInt,2,1)
455: DEF_DumbType(DumbInt,4,1)
456: DEF_DumbType(DumbInt,8,1)
457: DEF_DumbType(DumbInt,1,0)
458: DEF_DumbType(DumbInt,2,0)
459: DEF_DumbType(DumbInt,4,0)
460: DEF_DumbType(DumbInt,8,0)
462: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464: {
465: int ierr;
466: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468: return MPI_SUCCESS;
469: }
470: #endif
472: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
473: {
474: PetscErrorCode ierr;
475: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
476: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
479: /* Destroy device-specific fields */
480: if (link->deviceinited) {(*link->Destroy)(sf,link);}
482: /* Destroy host related fields */
483: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
484: if (!link->use_nvshmem) {
485: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
486: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
487: }
488: PetscFree(link->reqs);
489: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
490: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
491: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
492: }
493: }
494: PetscFree(link);
495: return(0);
496: }
498: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
499: {
500: PetscErrorCode ierr;
503: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
504: #if defined(PETSC_HAVE_NVSHMEM)
505: {
506: PetscBool use_nvshmem;
507: PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);
508: if (use_nvshmem) {
509: PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
510: return(0);
511: }
512: }
513: #endif
514: PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
515: return(0);
516: }
518: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
519: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
520: */
521: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
522: {
523: PetscErrorCode ierr;
524: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
525: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
526: const PetscInt *rootoffset,*leafoffset;
527: PetscMPIInt n;
528: MPI_Aint disp;
529: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
530: MPI_Datatype unit = link->unit;
531: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
532: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
535: /* Initialize persistent MPI requests if they have not been initialized yet. Currently only SFBasic uses persistent MPI */
536: if (sf->persistent) {
537: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
538: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
539: if (direction == PETSCSF_LEAF2ROOT) {
540: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
541: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
542: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
543: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
544: }
545: } else { /* PETSCSF_ROOT2LEAF */
546: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
547: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
548: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
549: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
550: }
551: }
552: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
553: }
555: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
556: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
557: if (direction == PETSCSF_LEAF2ROOT) {
558: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
559: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
560: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
561: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
562: }
563: } else { /* PETSCSF_ROOT2LEAF */
564: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
565: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
566: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
567: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
568: }
569: }
570: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
571: }
572: }
573: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
574: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
575: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
576: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
577: return(0);
578: }
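/* A sketch (not the actual driver code, which lives in sfbasic.c) of how a caller might use the
   buffers and persistent requests returned above; nrootreqs/nleafreqs request counts and rootdata
   are assumed to come from the surrounding context:

     void        *rootbuf,*leafbuf;
     MPI_Request *rootreqs,*leafreqs;
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
     MPI_Startall(nleafreqs,leafreqs);                          // post the leaf-side receives
     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);  // fill rootbuf
     MPI_Startall(nrootreqs,rootreqs);                          // start the root-side sends
     // ... MPI_Waitall() on both sets, then unpack leafbuf into leafdata
*/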
580: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
581: {
582: PetscErrorCode ierr;
583: PetscSFLink link,*p;
584: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
587: /* Look for types in cache */
588: for (p=&bas->inuse; (link=*p); p=&link->next) {
589: PetscBool match;
590: MPIPetsc_Type_compare(unit,link->unit,&match);
591: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
592: switch (cmode) {
593: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
594: case PETSC_USE_POINTER: break;
595: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
596: }
597: *mylink = link;
598: return(0);
599: }
600: }
601: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
602: }
604: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
605: {
606: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
607: PetscSFLink link = *mylink;
610: link->rootdata = NULL;
611: link->leafdata = NULL;
612: link->next = bas->avail;
613: bas->avail = link;
614: *mylink = NULL;
615: return(0);
616: }
618: /* Error out on unsupported overlapped communications */
619: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
620: {
621: PetscErrorCode ierr;
622: PetscSFLink link,*p;
623: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
624: PetscBool match;
627: if (PetscDefined(USE_DEBUG)) {
628: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
629: the potential overlap since this process does not participate in the communication; overlapping is then harmless.
630: */
631: if (rootdata || leafdata) {
632: for (p=&bas->inuse; (link=*p); p=&link->next) {
633: MPIPetsc_Type_compare(unit,link->unit,&match);
634: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
635: }
636: }
637: }
638: return(0);
639: }
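/* For example (a hypothetical sketch), the second call below would trigger the error above in a
   debug build, because it reuses the unit/rootdata/leafdata of a link that is still in use:

     PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);  // error: overlapped SF operations on the same buffers
     PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
*/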
641: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
642: {
644: if (n) {PetscMemcpy(dst,src,n);}
645: return(0);
646: }
648: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
649: {
651: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
652: PetscBool is2Int,is2PetscInt;
653: PetscMPIInt ni,na,nd,combiner;
654: #if defined(PETSC_HAVE_COMPLEX)
655: PetscInt nPetscComplex=0;
656: #endif
659: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
660: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
661: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
662: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
663: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
664: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
665: #if defined(PETSC_HAVE_COMPLEX)
666: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
667: #endif
668: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
669: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
670: /* TODO: shall we also handle Fortran MPI_2REAL? */
671: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
672: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
673: link->bs = 1; /* default */
675: if (is2Int) {
676: PackInit_PairType_int_int_1_1(link);
677: link->bs = 1;
678: link->unitbytes = 2*sizeof(int);
679: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
680: link->basicunit = MPI_2INT;
681: link->unit = MPI_2INT;
682: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
683: PackInit_PairType_PetscInt_PetscInt_1_1(link);
684: link->bs = 1;
685: link->unitbytes = 2*sizeof(PetscInt);
686: link->basicunit = MPIU_2INT;
687: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
688: link->unit = MPIU_2INT;
689: } else if (nPetscReal) {
690: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
691: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
692: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
693: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
694: link->bs = nPetscReal;
695: link->unitbytes = nPetscReal*sizeof(PetscReal);
696: link->basicunit = MPIU_REAL;
697: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
698: } else if (nPetscInt) {
699: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
700: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
701: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
702: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
703: link->bs = nPetscInt;
704: link->unitbytes = nPetscInt*sizeof(PetscInt);
705: link->basicunit = MPIU_INT;
706: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
707: #if defined(PETSC_USE_64BIT_INDICES)
708: } else if (nInt) {
709: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
710: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
711: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
712: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
713: link->bs = nInt;
714: link->unitbytes = nInt*sizeof(int);
715: link->basicunit = MPI_INT;
716: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
717: #endif
718: } else if (nSignedChar) {
719: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
720: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
721: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
722: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
723: link->bs = nSignedChar;
724: link->unitbytes = nSignedChar*sizeof(SignedChar);
725: link->basicunit = MPI_SIGNED_CHAR;
726: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
727: } else if (nUnsignedChar) {
728: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
729: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
730: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
731: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
732: link->bs = nUnsignedChar;
733: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
734: link->basicunit = MPI_UNSIGNED_CHAR;
735: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
736: #if defined(PETSC_HAVE_COMPLEX)
737: } else if (nPetscComplex) {
738: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
739: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
740: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
741: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
742: link->bs = nPetscComplex;
743: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
744: link->basicunit = MPIU_COMPLEX;
745: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
746: #endif
747: } else {
748: MPI_Aint lb,nbyte;
749: MPI_Type_get_extent(unit,&lb,&nbyte);
750: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
751: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
752: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
753: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
754: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
755: link->bs = nbyte;
756: link->unitbytes = nbyte;
757: link->basicunit = MPI_BYTE;
758: } else {
759: nInt = nbyte / sizeof(int);
760: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
761: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
762: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
763: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
764: link->bs = nInt;
765: link->unitbytes = nbyte;
766: link->basicunit = MPI_INT;
767: }
768: if (link->isbuiltin) link->unit = unit;
769: }
771: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
773: link->Memcpy = PetscSFLinkMemcpy_Host;
774: return(0);
775: }
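/* Worked example of the dispatch above: for a unit of 6 contiguous MPIU_REALs, nPetscReal = 6.
   Since 6 != 8, 6 % 8 != 0, 6 != 4, 6 % 4 != 0, and 6 != 2 but 6 % 2 == 0,
   PackInit_RealType_PetscReal_2_0() is selected: BS=2, EQ=0, and at run time bs=6 gives M=3.
   Bigger BS is tried first so the vectorizable inner loop is as long as possible. */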
777: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
778: {
780: *UnpackAndOp = NULL;
781: if (PetscMemTypeHost(mtype)) {
782: if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
783: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
784: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
785: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
786: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
787: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
788: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
789: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
790: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
791: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
792: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
793: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
794: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
795: }
796: #if defined(PETSC_HAVE_DEVICE)
797: else if (PetscMemTypeDevice(mtype) && !atomic) {
798: if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
799: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
800: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
801: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
802: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
803: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
804: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
805: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
806: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
807: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
808: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
809: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
810: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
811: } else if (PetscMemTypeDevice(mtype) && atomic) {
812: if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
813: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
814: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
815: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
816: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
817: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
818: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
819: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
820: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
821: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
822: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
823: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
824: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
825: }
826: #endif
827: return(0);
828: }
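/* E.g., for unit = MPIU_REAL (bs=1), PetscSFLinkSetUp_Host() called PackInit_RealType_PetscReal_1_1(),
   which set link->h_UnpackAndAdd = UnpackAndAdd_PetscReal_1_1, so a host-side reduction with MPI_SUM
   resolves here to that generated routine. */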
830: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
831: {
833: *ScatterAndOp = NULL;
834: if (PetscMemTypeHost(mtype)) {
835: if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
836: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
837: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
838: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
839: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
840: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
841: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
842: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
843: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
844: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
845: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
846: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
847: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
848: }
849: #if defined(PETSC_HAVE_DEVICE)
850: else if (PetscMemTypeDevice(mtype) && !atomic) {
851: if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
852: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
853: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
854: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
855: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
856: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
857: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
858: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
859: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
860: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
861: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
862: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
863: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
864: } else if (PetscMemTypeDevice(mtype) && atomic) {
865: if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
866: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
867: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
868: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
869: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
870: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
871: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
872: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
873: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
874: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
875: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
876: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
877: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
878: }
879: #endif
880: return(0);
881: }
883: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
884: {
886: *FetchAndOp = NULL;
887: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
888: if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
889: #if defined(PETSC_HAVE_DEVICE)
890: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
891: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
892: #endif
893: return(0);
894: }
896: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
897: {
899: *FetchAndOpLocal = NULL;
900: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
901: if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
902: #if defined(PETSC_HAVE_DEVICE)
903: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
904: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
905: #endif
906: return(0);
907: }
909: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
910: {
912: PetscLogDouble flops;
913: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
916: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
917: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
918: #if defined(PETSC_HAVE_DEVICE)
919: if (PetscMemTypeDevice(link->rootmtype)) {PetscLogGpuFlops(flops);} else
920: #endif
921: {PetscLogFlops(flops);}
922: }
923: return(0);
924: }
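/* E.g., unpacking 100 remote root entries with bs=3 PetscScalars per unit and op = MPI_SUM logs 300 flops. */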
926: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
927: {
928: PetscLogDouble flops;
932: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
933: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
934: #if defined(PETSC_HAVE_DEVICE)
935: if (PetscMemTypeDevice(link->leafmtype)) {PetscLogGpuFlops(flops);} else
936: #endif
937: {PetscLogFlops(flops);}
938: }
939: return(0);
940: }
942: /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
943: Input Parameters:
944: +sf - The StarForest
945: .link - The link
946: .count - Number of entries to unpack
947: .start - The first index, significant when indices=NULL
948: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
949: .buf - A contiguous buffer to unpack from
950: -op - Operation after unpack
952: Output Parameters:
953: .data - The data to unpack to
954: */
955: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
956: {
958: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
959: {
961: PetscInt i;
962: PetscMPIInt n;
963: if (indices) {
964: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
965: basic units of a root/leaf element-wise. Otherwise, it is meant to operate on a whole root/leaf.
966: */
967: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
968: } else {
969: PetscMPIIntCast(count,&n);
970: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
971: }
972: }
973: return(0);
974: #else
975: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
976: #endif
977: }
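/* A hypothetical sketch of when this fallback is reached: a user-defined MPI_Op has no matching
   UnpackAndOp routine, so each root/leaf entry is reduced as one whole link->unit:

     void MyOp(void *in,void *inout,int *len,MPI_Datatype *dtype);  // user reduction, defined elsewhere
     MPI_Op userop;
     MPI_Op_create(MyOp,0,&userop);                                 // 0: not commutative
     PetscSFReduceBegin(sf,unit,leafdata,rootdata,userop);          // unpacking ends up in the loop above
     PetscSFReduceEnd(sf,unit,leafdata,rootdata,userop);
*/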
979: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
980: {
982: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
983: {
985: PetscInt i,disp;
986: if (!srcIdx) {
987: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
988: } else {
989: for (i=0; i<count; i++) {
990: disp = dstIdx? dstIdx[i] : dstStart + i;
991: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
992: }
993: }
994: }
995: return(0);
996: #else
997: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
998: #endif
999: }
1001: /*=============================================================================
1002: Pack/Unpack/Fetch/Scatter routines
1003: ============================================================================*/
1005: /* Pack rootdata to rootbuf
1006: Input Parameters:
1007: + sf - The SF this packing works on.
1008: . link - It gives the memtype of the roots and also provides root buffer.
1009: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1010: - rootdata - Where to read the roots.
1012: Notes:
1013: When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the
1014: root data is in a place where the underlying MPI is ready to access it (whether use_gpu_aware_mpi is set or not).
1015: */
1016: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1017: {
1018: PetscErrorCode ierr;
1019: const PetscInt *rootindices = NULL;
1020: PetscInt count,start;
1021: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1022: PetscMemType rootmtype = link->rootmtype;
1023: PetscSFPackOpt opt = NULL;
1026: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1027: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1028: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1029: PetscSFLinkGetPack(link,rootmtype,&Pack);
1030: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1031: }
1032: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1033: return(0);
1034: }
1036: /* Pack leafdata to leafbuf */
1037: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1038: {
1039: PetscErrorCode ierr;
1040: const PetscInt *leafindices = NULL;
1041: PetscInt count,start;
1042: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1043: PetscMemType leafmtype = link->leafmtype;
1044: PetscSFPackOpt opt = NULL;
1047: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1048: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1049: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1050: PetscSFLinkGetPack(link,leafmtype,&Pack);
1051: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1052: }
1053: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1054: return(0);
1055: }
1057: /* Pack rootdata to rootbuf, which are in the same memory space */
1058: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1059: {
1060: PetscErrorCode ierr;
1061: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1064: if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on PETSc's default stream */
1065: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1066: if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);} /* Used by SF nvshmem */
1067: }
1068: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1069: if (bas->rootbuflen[scope]) {PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);}
1070: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1071: return(0);
1072: }
1073: /* Pack leafdata to leafbuf, which are in the same memory space */
1074: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1075: {
1076: PetscErrorCode ierr;
1079: if (scope == PETSCSF_REMOTE) {
1080: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1081: if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);} /* Used by SF nvshmem */
1082: }
1083: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1084: if (sf->leafbuflen[scope]) {PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);}
1085: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1086: return(0);
1087: }
1089: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1090: {
1091: PetscErrorCode ierr;
1092: const PetscInt *rootindices = NULL;
1093: PetscInt count,start;
1094: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1095: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1096: PetscMemType rootmtype = link->rootmtype;
1097: PetscSFPackOpt opt = NULL;
1100: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1101: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1102: if (UnpackAndOp) {
1103: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1104: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1105: } else {
1106: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1107: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1108: }
1109: }
1110: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1111: return(0);
1112: }
1114: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1115: {
1116: PetscErrorCode ierr;
1117: const PetscInt *leafindices = NULL;
1118: PetscInt count,start;
1119: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1120: PetscMemType leafmtype = link->leafmtype;
1121: PetscSFPackOpt opt = NULL;
1124: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1125: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1126: if (UnpackAndOp) {
1127: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1128: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1129: } else {
1130: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1131: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1132: }
1133: }
1134: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1135: return(0);
1136: }
1137: /* Unpack rootbuf to rootdata, which are in the same memory space */
1138: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1139: {
1140: PetscErrorCode ierr;
1141: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1144: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1145: if (bas->rootbuflen[scope]) {PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);}
1146: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1147: if (scope == PETSCSF_REMOTE) {
1148: if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);} /* Used by SF nvshmem */
1149: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1150: }
1151: return(0);
1152: }
1154: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1155: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1156: {
1157: PetscErrorCode ierr;
1160: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1161: if (sf->leafbuflen[scope]) {PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
1162: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1163: if (scope == PETSCSF_REMOTE) {
1164: if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);} /* Used by SF nvshmem */
1165: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1166: }
1167: return(0);
1168: }
1170: /* FetchAndOp rootdata with rootbuf; it is a kind of Unpack on rootdata, except that it also updates rootbuf */
1171: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1172: {
1173: PetscErrorCode ierr;
1174: const PetscInt *rootindices = NULL;
1175: PetscInt count,start;
1176: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1177: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1178: PetscMemType rootmtype = link->rootmtype;
1179: PetscSFPackOpt opt = NULL;
1182: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1183: if (bas->rootbuflen[PETSCSF_REMOTE]) {
1184: /* Do FetchAndOp on rootdata with rootbuf */
1185: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);
1186: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);
1187: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);
1188: }
1189: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);
1190: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1191: return(0);
1192: }
1194: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1195: {
1196: PetscErrorCode ierr;
1197: const PetscInt *rootindices = NULL,*leafindices = NULL;
1198: PetscInt count,rootstart,leafstart;
1199: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1200: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1201: PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1202: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1203: PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
1204: char *srcbuf = NULL,*dstbuf = NULL;
1205: PetscBool dstdups;
1208: if (!buflen) return(0);
1209: if (rootmtype != leafmtype) { /* A cross-memory-space local scatter is done by pack, copy and unpack */
1210: if (direction == PETSCSF_ROOT2LEAF) {
1211: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1212: srcmtype = rootmtype;
1213: srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1214: dstmtype = leafmtype;
1215: dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1216: } else {
1217: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1218: srcmtype = leafmtype;
1219: srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1220: dstmtype = rootmtype;
1221: dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1222: }
1223: (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);
1224: /* If the above is a device-to-host copy, we have to sync the stream before accessing the buffer on the host */
1225: if (PetscMemTypeHost(dstmtype)) {(*link->SyncStream)(link);}
1226: if (direction == PETSCSF_ROOT2LEAF) {
1227: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1228: } else {
1229: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1230: }
1231: } else {
1232: dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1233: dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1234: PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);
1235: if (ScatterAndOp) {
1236: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1237: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1238: if (direction == PETSCSF_ROOT2LEAF) {
1239: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1240: } else {
1241: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1242: }
1243: } else {
1244: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1245: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1246: if (direction == PETSCSF_ROOT2LEAF) {
1247: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1248: } else {
1249: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1250: }
1251: }
1252: }
1253: return(0);
1254: }
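/* Editorial sketch: in the same-memory-type branch above, ScatterAndOp fuses pack and
   unpack into one pass with no intermediate buffer. Its per-entry effect, shown for
   MPI_REPLACE with bs=1 (illustrative name, not a generated symbol): */
static PetscErrorCode ScatterAndInsert_Sketch(PetscInt count,PetscInt srcstart,const PetscInt *srcidx,const PetscInt *src,PetscInt dststart,const PetscInt *dstidx,PetscInt *dst)
{
  PetscInt i;
  for (i=0; i<count; i++) { /* NULL index arrays mean contiguous ranges */
    PetscInt s = srcidx ? srcidx[i] : srcstart+i;
    PetscInt d = dstidx ? dstidx[i] : dststart+i;
    dst[d] = src[s]; /* in general: dst[d] = dst[d] op src[s] */
  }
  return 0;
}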
1256: /* Fetch rootdata into leafupdate and reduce leafdata into rootdata, all locally */
1257: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1258: {
1259: PetscErrorCode ierr;
1260: const PetscInt *rootindices = NULL,*leafindices = NULL;
1261: PetscInt count,rootstart,leafstart;
1262: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1263: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1264: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1265: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1268: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1269: if (rootmtype != leafmtype) {
1270: /* The local communication has to go through pack and unpack */
1271: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1272: } else {
1273: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1274: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1275: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1276: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1277: }
1278: return(0);
1279: }
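/* Editorial note: a concrete local fetch-and-add trace (values made up). Let two
   leaves map to the same root, rootdata = {5}, leafdata = {2,3}, op = MPI_SUM.
   Processing leaves in order gives
     leafupdate[0] = 5;  rootdata[0] = 5 + 2 = 7;
     leafupdate[1] = 7;  rootdata[0] = 7 + 3 = 10;
   i.e., each leaf fetches the running pre-op root value. This read-then-update
   dependence is why duplicate root indices (bas->rootdups) select atomic or
   serialized device kernels in PetscSFLinkGetFetchAndOpLocal above. */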
1281: /*
1282: Create per-rank pack/unpack optimizations based on index patterns
1284: Input Parameters:
1285: + n - Number of destination ranks
1286: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1287: - idx - [*] Array storing indices
1289: Output Parameter:
1290: . out - Pack optimization plan. NULL if no optimization was found.
1291: */
1292: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1293: {
1295: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1296: PetscBool optimizable = PETSC_TRUE;
1297: PetscSFPackOpt opt;
1300: PetscMalloc1(1,&opt);
1301: PetscMalloc1(7*n+2,&opt->array);
1302: opt->n = opt->array[0] = n;
1303: opt->offset = opt->array + 1;
1304: opt->start = opt->array + n + 2;
1305: opt->dx = opt->array + 2*n + 2;
1306: opt->dy = opt->array + 3*n + 2;
1307: opt->dz = opt->array + 4*n + 2;
1308: opt->X = opt->array + 5*n + 2;
1309: opt->Y = opt->array + 6*n + 2;
1311: for (r=0; r<n; r++) { /* For each destination rank */
1312: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1313: p = offset[r];
1314: start = idx[p]; /* First index for this rank */
1315: p++;
1317: /* Search in X dimension */
1318: for (dx=1; dx<m; dx++,p++) {
1319: if (start+dx != idx[p]) break;
1320: }
1322: dydz = m/dx;
1323: X = dydz > 1 ? (idx[p]-start) : dx;
1324: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1325: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1326: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1327: for (i=0; i<dx; i++,p++) {
1328: if (start+X*dy+i != idx[p]) {
1329: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1330: else goto Z_dimension;
1331: }
1332: }
1333: }
1335: Z_dimension:
1336: dz = m/(dx*dy);
1337: Y = dz > 1 ? (idx[p]-start)/X : dy;
1338: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1339: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1340: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1341: for (j=0; j<dy; j++) {
1342: for (i=0; i<dx; i++,p++) {
1343: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1344: }
1345: }
1346: }
1347: opt->start[r] = start;
1348: opt->dx[r] = dx;
1349: opt->dy[r] = dy;
1350: opt->dz[r] = dz;
1351: opt->X[r] = X;
1352: opt->Y[r] = Y;
1353: }
1355: finish:
1356: /* If not optimizable, free arrays to save memory */
1357: if (!n || !optimizable) {
1358: PetscFree(opt->array);
1359: PetscFree(opt);
1360: *out = NULL;
1361: } else {
1362: opt->offset[0] = 0;
1363: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1364: *out = opt;
1365: }
1366: return(0);
1367: }
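/* Editorial note: a worked example of the factorization above (indices made up).
   For one rank with idx[] = {100,101,102, 110,111,112, 120,121,122}, the search
   finds dx=3 (the run 100..102), X=10 (stride from 100 to 110), dy=3, dz=1, so
   m = 9 = dx*dy*dz and packing this rank becomes dy*dz = 3 strided array copies of
   length dx instead of 9 indexed loads; this is exactly the opt->X/Y/dx/dy/dz
   pattern consumed by DEF_PackFunc. A standalone check of the recognized pattern
   (hypothetical helper, not part of the source): */
static PetscBool MatchesBlockPattern(PetscInt m,const PetscInt *idx,PetscInt start,PetscInt dx,PetscInt dy,PetscInt dz,PetscInt X,PetscInt Y)
{
  PetscInt i,j,k,p = 0;
  if (m != dx*dy*dz) return PETSC_FALSE;
  for (k=0; k<dz; k++)        /* the indices must form a dx x dy x dz sub-block */
    for (j=0; j<dy; j++)      /* of a (virtual) X x Y x * array, starting at start */
      for (i=0; i<dx; i++,p++)
        if (idx[p] != start + X*Y*k + X*j + i) return PETSC_FALSE;
  return PETSC_TRUE;
}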
1369: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1370: {
1372: PetscSFPackOpt opt = *out;
1375: if (opt) {
1376: PetscSFFree(sf,mtype,opt->array);
1377: PetscFree(opt);
1378: *out = NULL;
1379: }
1380: return(0);
1381: }
1383: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1384: {
1386: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1387: PetscInt i,j;
1390: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1391: for (i=0; i<2; i++) { /* Set defaults */
1392: sf->leafstart[i] = 0;
1393: sf->leafcontig[i] = PETSC_TRUE;
1394: sf->leafdups[i] = PETSC_FALSE;
1395: bas->rootstart[i] = 0;
1396: bas->rootcontig[i] = PETSC_TRUE;
1397: bas->rootdups[i] = PETSC_FALSE;
1398: }
1400: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1401: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1403: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1404: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1406: /* Are leaf indices for self and remote contiguous? If yes, packing is just a contiguous array copy, the best case */
1407: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1408: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1409: }
1410: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1411: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1412: }
1414: /* If not, see if we can have per-rank optimizations by doing index analysis */
1415: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1416: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
1418: /* Are root indices for self and remote contiguous? */
1419: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1420: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1422: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1423: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1425: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1426: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1427: }
1428: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1429: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1430: }
1432: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1433: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1435: #if defined(PETSC_HAVE_DEVICE)
1436: /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1437: if (PetscDefined(HAVE_DEVICE)) {
1438: PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1443: }
1444: #endif
1445: return(0);
1446: }
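/* Editorial note: a small illustration of the contiguity tests above (indices made
   up). rmine[] = {7,8,9,10} for a scope yields leafstart = 7 and leafcontig =
   PETSC_TRUE, so packing that scope is a single contiguous array copy with idx = NULL;
   rmine[] = {7,9,10} fails the test at i = 1 (9 != 7+1), leafcontig becomes
   PETSC_FALSE, and PetscSFCreatePackOpt() is asked for a per-rank plan instead. */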
1448: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1449: {
1451: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1452: PetscInt i;
1455: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1456: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1457: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1458: #if defined(PETSC_HAVE_DEVICE)
1459: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1460: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1461: #endif
1462: }
1463: return(0);
1464: }