Actual source code: aijperm.c
2: /*
3: Defines basic operations for the MATSEQAIJPERM matrix class.
4: This class is derived from the MATSEQAIJ class and retains the
5: compressed row storage (aka Yale sparse matrix format) but augments
6: it with some permutation information that enables some operations
7: to be more vectorizable. A physically rearranged copy of the matrix
8: may be stored if the user desires.
10: Eventually a variety of permutations may be supported.
11: */
13: #include <../src/mat/impls/aij/seq/aij.h>
15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16: #include <immintrin.h>
18: #if !defined(_MM_SCALE_8)
19: #define _MM_SCALE_8 8
20: #endif
21: #if !defined(_MM_SCALE_4)
22: #define _MM_SCALE_4 4
23: #endif
24: #endif
26: #define NDIM 512
27: /* NDIM specifies how many rows at a time we should work with when
28: * performing the vectorized mat-vec. This depends on various factors
29: * such as vector register length, etc., and I really need to add a
30: * way for the user (or the library) to tune this. I'm setting it to
31: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32: * routines. */
34: typedef struct {
35: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
37: PetscInt ngroup;
38: PetscInt *xgroup;
39: /* Denotes where groups of rows with same number of nonzeros
40: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41: * where the ith group begins. */
43: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44: PetscInt *iperm; /* The permutation vector. */
46: /* Some of this stuff is for Ed's recursive triangular solve.
47: * I'm not sure what I need yet. */
48: PetscInt blocksize;
49: PetscInt nstep;
50: PetscInt *jstart_list;
51: PetscInt *jend_list;
52: PetscInt *action_list;
53: PetscInt *ngroup_list;
54: PetscInt **ipointer_list;
55: PetscInt **xgroup_list;
56: PetscInt **nzgroup_list;
57: PetscInt **iperm_list;
58: } Mat_SeqAIJPERM;
60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61: {
62: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63: /* so we will ignore 'MatType type'. */
65: Mat B = *newmat;
66: Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
69: if (reuse == MAT_INITIAL_MATRIX) {
70: MatDuplicate(A,MAT_COPY_VALUES,&B);
71: aijperm=(Mat_SeqAIJPERM*)B->spptr;
72: }
74: /* Reset the original function pointers. */
75: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76: B->ops->destroy = MatDestroy_SeqAIJ;
77: B->ops->duplicate = MatDuplicate_SeqAIJ;
78: B->ops->mult = MatMult_SeqAIJ;
79: B->ops->multadd = MatMultAdd_SeqAIJ;
81: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);
83: /* Free everything in the Mat_SeqAIJPERM data structure.*/
84: PetscFree(aijperm->xgroup);
85: PetscFree(aijperm->nzgroup);
86: PetscFree(aijperm->iperm);
87: PetscFree(B->spptr);
89: /* Change the type of B to MATSEQAIJ. */
90: PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);
92: *newmat = B;
93: return(0);
94: }
96: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
97: {
99: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
102: if (aijperm) {
103: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
104: PetscFree(aijperm->xgroup);
105: PetscFree(aijperm->nzgroup);
106: PetscFree(aijperm->iperm);
107: PetscFree(A->spptr);
108: }
109: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
110: * to destroy everything that remains. */
111: PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
112: /* Note that I don't call MatSetType(). I believe this is because that
113: * is only to be called when *building* a matrix. I could be wrong, but
114: * that is how things work for the SuperLU matrix class. */
115: MatDestroy_SeqAIJ(A);
116: return(0);
117: }
119: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
120: {
122: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
123: Mat_SeqAIJPERM *aijperm_dest;
124: PetscBool perm;
127: MatDuplicate_SeqAIJ(A,op,M);
128: PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);
129: if (perm) {
130: aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
131: PetscFree(aijperm_dest->xgroup);
132: PetscFree(aijperm_dest->nzgroup);
133: PetscFree(aijperm_dest->iperm);
134: } else {
135: PetscNewLog(*M,&aijperm_dest);
136: (*M)->spptr = (void*) aijperm_dest;
137: PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);
138: PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
139: }
140: PetscArraycpy(aijperm_dest,aijperm,1);
141: /* Allocate space for, and copy the grouping and permutation info.
142: * I note that when the groups are initially determined in
143: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
144: * necessary. But at this point, we know how large they need to be, and
145: * allocate only the necessary amount of memory. So the duplicated matrix
146: * may actually use slightly less storage than the original! */
147: PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);
148: PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);
149: PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);
150: PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);
151: PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);
152: PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);
153: return(0);
154: }
156: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
157: {
159: Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data;
160: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
161: PetscInt m; /* Number of rows in the matrix. */
162: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
163: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
164: PetscInt *rows_in_bucket;
165: /* To construct the permutation, we sort each row into one of maxnz
166: * buckets based on how many nonzeros are in the row. */
167: PetscInt nz;
168: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
169: PetscInt *ipnz;
170: /* When constructing the iperm permutation vector,
171: * ipnz[nz] is used to point to the next place in the permutation vector
172: * that a row with nz nonzero elements should be placed.*/
173: PetscInt i, ngroup, istart, ipos;
176: if (aijperm->nonzerostate == A->nonzerostate) return(0); /* permutation exists and matches current nonzero structure */
177: aijperm->nonzerostate = A->nonzerostate;
178: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
179: PetscFree(aijperm->xgroup);
180: PetscFree(aijperm->nzgroup);
181: PetscFree(aijperm->iperm);
183: m = A->rmap->n;
184: ia = a->i;
186: /* Allocate the arrays that will hold the permutation vector. */
187: PetscMalloc1(m, &aijperm->iperm);
189: /* Allocate some temporary work arrays that will be used in
190: * calculating the permuation vector and groupings. */
191: PetscMalloc1(m, &nz_in_row);
193: /* Now actually figure out the permutation and grouping. */
195: /* First pass: Determine number of nonzeros in each row, maximum
196: * number of nonzeros in any row, and how many rows fall into each
197: * "bucket" of rows with same number of nonzeros. */
198: maxnz = 0;
199: for (i=0; i<m; i++) {
200: nz_in_row[i] = ia[i+1]-ia[i];
201: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
202: }
203: PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);
204: PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);
206: for (i=0; i<=maxnz; i++) {
207: rows_in_bucket[i] = 0;
208: }
209: for (i=0; i<m; i++) {
210: nz = nz_in_row[i];
211: rows_in_bucket[nz]++;
212: }
214: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
215: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
216: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
217: * then xgroup[] must consist of (maxnz + 2) elements, since the last
218: * element of xgroup will tell us where the (maxnz + 1)th group ends.
219: * We allocate space for the maximum number of groups;
220: * that is potentially a little wasteful, but not too much so.
221: * Perhaps I should fix it later. */
222: PetscMalloc1(maxnz+2, &aijperm->xgroup);
223: PetscMalloc1(maxnz+1, &aijperm->nzgroup);
225: /* Second pass. Look at what is in the buckets and create the groupings.
226: * Note that it is OK to have a group of rows with no non-zero values. */
227: ngroup = 0;
228: istart = 0;
229: for (i=0; i<=maxnz; i++) {
230: if (rows_in_bucket[i] > 0) {
231: aijperm->nzgroup[ngroup] = i;
232: aijperm->xgroup[ngroup] = istart;
233: ngroup++;
234: istart += rows_in_bucket[i];
235: }
236: }
238: aijperm->xgroup[ngroup] = istart;
239: aijperm->ngroup = ngroup;
241: /* Now fill in the permutation vector iperm. */
242: ipnz[0] = 0;
243: for (i=0; i<maxnz; i++) {
244: ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
245: }
247: for (i=0; i<m; i++) {
248: nz = nz_in_row[i];
249: ipos = ipnz[nz];
250: aijperm->iperm[ipos] = i;
251: ipnz[nz]++;
252: }
254: /* Clean up temporary work arrays. */
255: PetscFree(rows_in_bucket);
256: PetscFree(ipnz);
257: PetscFree(nz_in_row);
258: return(0);
259: }
261: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
262: {
264: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
267: if (mode == MAT_FLUSH_ASSEMBLY) return(0);
269: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
270: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
271: * I'm not sure if this is the best way to do this, but it avoids
272: * a lot of code duplication.
273: * I also note that currently MATSEQAIJPERM doesn't know anything about
274: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
275: * are many zero rows. If the SeqAIJ assembly end routine decides to use
276: * this, this may break things. (Don't know... haven't looked at it.) */
277: a->inode.use = PETSC_FALSE;
278: MatAssemblyEnd_SeqAIJ(A, mode);
280: /* Now calculate the permutation and grouping information. */
281: MatSeqAIJPERM_create_perm(A);
282: return(0);
283: }
285: PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
286: {
287: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
288: const PetscScalar *x;
289: PetscScalar *y;
290: const MatScalar *aa;
291: PetscErrorCode ierr;
292: const PetscInt *aj,*ai;
293: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
294: PetscInt i,j;
295: #endif
296: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
297: __m512d vec_x,vec_y,vec_vals;
298: __m256i vec_idx,vec_ipos,vec_j;
299: __mmask8 mask;
300: #endif
302: /* Variables that don't appear in MatMult_SeqAIJ. */
303: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
304: PetscInt *iperm; /* Points to the permutation vector. */
305: PetscInt *xgroup;
306: /* Denotes where groups of rows with same number of nonzeros
307: * begin and end in iperm. */
308: PetscInt *nzgroup;
309: PetscInt ngroup;
310: PetscInt igroup;
311: PetscInt jstart,jend;
312: /* jstart is used in loops to denote the position in iperm where a
313: * group starts; jend denotes the position where it ends.
314: * (jend + 1 is where the next group starts.) */
315: PetscInt iold,nz;
316: PetscInt istart,iend,isize;
317: PetscInt ipos;
318: PetscScalar yp[NDIM];
319: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
321: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
322: #pragma disjoint(*x,*y,*aa)
323: #endif
326: VecGetArrayRead(xx,&x);
327: VecGetArray(yy,&y);
328: aj = a->j; /* aj[k] gives column index for element aa[k]. */
329: aa = a->a; /* Nonzero elements stored row-by-row. */
330: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
332: /* Get the info we need about the permutations and groupings. */
333: iperm = aijperm->iperm;
334: ngroup = aijperm->ngroup;
335: xgroup = aijperm->xgroup;
336: nzgroup = aijperm->nzgroup;
338: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
339: fortranmultaijperm_(&m,x,ii,aj,aa,y);
340: #else
342: for (igroup=0; igroup<ngroup; igroup++) {
343: jstart = xgroup[igroup];
344: jend = xgroup[igroup+1] - 1;
345: nz = nzgroup[igroup];
347: /* Handle the special cases where the number of nonzeros per row
348: * in the group is either 0 or 1. */
349: if (nz == 0) {
350: for (i=jstart; i<=jend; i++) {
351: y[iperm[i]] = 0.0;
352: }
353: } else if (nz == 1) {
354: for (i=jstart; i<=jend; i++) {
355: iold = iperm[i];
356: ipos = ai[iold];
357: y[iold] = aa[ipos] * x[aj[ipos]];
358: }
359: } else {
361: /* We work our way through the current group in chunks of NDIM rows
362: * at a time. */
364: for (istart=jstart; istart<=jend; istart+=NDIM) {
365: /* Figure out where the chunk of 'isize' rows ends in iperm.
366: * 'isize may of course be less than NDIM for the last chunk. */
367: iend = istart + (NDIM - 1);
369: if (iend > jend) iend = jend;
371: isize = iend - istart + 1;
373: /* Initialize the yp[] array that will be used to hold part of
374: * the permuted results vector, and figure out where in aa each
375: * row of the chunk will begin. */
376: for (i=0; i<isize; i++) {
377: iold = iperm[istart + i];
378: /* iold is a row number from the matrix A *before* reordering. */
379: ip[i] = ai[iold];
380: /* ip[i] tells us where the ith row of the chunk begins in aa. */
381: yp[i] = (PetscScalar) 0.0;
382: }
384: /* If the number of zeros per row exceeds the number of rows in
385: * the chunk, we should vectorize along nz, that is, perform the
386: * mat-vec one row at a time as in the usual CSR case. */
387: if (nz > isize) {
388: #if defined(PETSC_HAVE_CRAY_VECTOR)
389: #pragma _CRI preferstream
390: #endif
391: for (i=0; i<isize; i++) {
392: #if defined(PETSC_HAVE_CRAY_VECTOR)
393: #pragma _CRI prefervector
394: #endif
396: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
397: vec_y = _mm512_setzero_pd();
398: ipos = ip[i];
399: for (j=0; j<(nz>>3); j++) {
400: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
401: vec_vals = _mm512_loadu_pd(&aa[ipos]);
402: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
403: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
404: ipos += 8;
405: }
406: if ((nz&0x07)>2) {
407: mask = (__mmask8)(0xff >> (8-(nz&0x07)));
408: vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409: vec_vals = _mm512_loadu_pd(&aa[ipos]);
410: vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
411: vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
412: } else if ((nz&0x07)==2) {
413: yp[i] += aa[ipos]*x[aj[ipos]];
414: yp[i] += aa[ipos+1]*x[aj[ipos+1]];
415: } else if ((nz&0x07)==1) {
416: yp[i] += aa[ipos]*x[aj[ipos]];
417: }
418: yp[i] += _mm512_reduce_add_pd(vec_y);
419: #else
420: for (j=0; j<nz; j++) {
421: ipos = ip[i] + j;
422: yp[i] += aa[ipos] * x[aj[ipos]];
423: }
424: #endif
425: }
426: } else {
427: /* Otherwise, there are enough rows in the chunk to make it
428: * worthwhile to vectorize across the rows, that is, to do the
429: * matvec by operating with "columns" of the chunk. */
430: for (j=0; j<nz; j++) {
431: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
432: vec_j = _mm256_set1_epi32(j);
433: for (i=0; i<((isize>>3)<<3); i+=8) {
434: vec_y = _mm512_loadu_pd(&yp[i]);
435: vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
436: vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
437: vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
438: vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
439: vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
440: vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
441: _mm512_storeu_pd(&yp[i],vec_y);
442: }
443: for (i=isize-(isize&0x07); i<isize; i++) {
444: ipos = ip[i]+j;
445: yp[i] += aa[ipos]*x[aj[ipos]];
446: }
447: #else
448: for (i=0; i<isize; i++) {
449: ipos = ip[i] + j;
450: yp[i] += aa[ipos] * x[aj[ipos]];
451: }
452: #endif
453: }
454: }
456: #if defined(PETSC_HAVE_CRAY_VECTOR)
457: #pragma _CRI ivdep
458: #endif
459: /* Put results from yp[] into non-permuted result vector y. */
460: for (i=0; i<isize; i++) {
461: y[iperm[istart+i]] = yp[i];
462: }
463: } /* End processing chunk of isize rows of a group. */
464: } /* End handling matvec for chunk with nz > 1. */
465: } /* End loop over igroup. */
466: #endif
467: PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));
468: VecRestoreArrayRead(xx,&x);
469: VecRestoreArray(yy,&y);
470: return(0);
471: }
473: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
474: * Note that the names I used to designate the vectors differs from that
475: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
476: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
477: /*
478: I hate having virtually identical code for the mult and the multadd!!!
479: */
480: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
481: {
482: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
483: const PetscScalar *x;
484: PetscScalar *y,*w;
485: const MatScalar *aa;
486: PetscErrorCode ierr;
487: const PetscInt *aj,*ai;
488: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
489: PetscInt i,j;
490: #endif
492: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
493: Mat_SeqAIJPERM * aijperm;
494: PetscInt *iperm; /* Points to the permutation vector. */
495: PetscInt *xgroup;
496: /* Denotes where groups of rows with same number of nonzeros
497: * begin and end in iperm. */
498: PetscInt *nzgroup;
499: PetscInt ngroup;
500: PetscInt igroup;
501: PetscInt jstart,jend;
502: /* jstart is used in loops to denote the position in iperm where a
503: * group starts; jend denotes the position where it ends.
504: * (jend + 1 is where the next group starts.) */
505: PetscInt iold,nz;
506: PetscInt istart,iend,isize;
507: PetscInt ipos;
508: PetscScalar yp[NDIM];
509: PetscInt ip[NDIM];
510: /* yp[] and ip[] are treated as vector "registers" for performing
511: * the mat-vec. */
513: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
514: #pragma disjoint(*x,*y,*aa)
515: #endif
518: VecGetArrayRead(xx,&x);
519: VecGetArrayPair(yy,ww,&y,&w);
521: aj = a->j; /* aj[k] gives column index for element aa[k]. */
522: aa = a->a; /* Nonzero elements stored row-by-row. */
523: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
525: /* Get the info we need about the permutations and groupings. */
526: aijperm = (Mat_SeqAIJPERM*) A->spptr;
527: iperm = aijperm->iperm;
528: ngroup = aijperm->ngroup;
529: xgroup = aijperm->xgroup;
530: nzgroup = aijperm->nzgroup;
532: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
533: fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
534: #else
536: for (igroup=0; igroup<ngroup; igroup++) {
537: jstart = xgroup[igroup];
538: jend = xgroup[igroup+1] - 1;
540: nz = nzgroup[igroup];
542: /* Handle the special cases where the number of nonzeros per row
543: * in the group is either 0 or 1. */
544: if (nz == 0) {
545: for (i=jstart; i<=jend; i++) {
546: iold = iperm[i];
547: y[iold] = w[iold];
548: }
549: }
550: else if (nz == 1) {
551: for (i=jstart; i<=jend; i++) {
552: iold = iperm[i];
553: ipos = ai[iold];
554: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
555: }
556: }
557: /* For the general case: */
558: else {
560: /* We work our way through the current group in chunks of NDIM rows
561: * at a time. */
563: for (istart=jstart; istart<=jend; istart+=NDIM) {
564: /* Figure out where the chunk of 'isize' rows ends in iperm.
565: * 'isize may of course be less than NDIM for the last chunk. */
566: iend = istart + (NDIM - 1);
567: if (iend > jend) iend = jend;
568: isize = iend - istart + 1;
570: /* Initialize the yp[] array that will be used to hold part of
571: * the permuted results vector, and figure out where in aa each
572: * row of the chunk will begin. */
573: for (i=0; i<isize; i++) {
574: iold = iperm[istart + i];
575: /* iold is a row number from the matrix A *before* reordering. */
576: ip[i] = ai[iold];
577: /* ip[i] tells us where the ith row of the chunk begins in aa. */
578: yp[i] = w[iold];
579: }
581: /* If the number of zeros per row exceeds the number of rows in
582: * the chunk, we should vectorize along nz, that is, perform the
583: * mat-vec one row at a time as in the usual CSR case. */
584: if (nz > isize) {
585: #if defined(PETSC_HAVE_CRAY_VECTOR)
586: #pragma _CRI preferstream
587: #endif
588: for (i=0; i<isize; i++) {
589: #if defined(PETSC_HAVE_CRAY_VECTOR)
590: #pragma _CRI prefervector
591: #endif
592: for (j=0; j<nz; j++) {
593: ipos = ip[i] + j;
594: yp[i] += aa[ipos] * x[aj[ipos]];
595: }
596: }
597: }
598: /* Otherwise, there are enough rows in the chunk to make it
599: * worthwhile to vectorize across the rows, that is, to do the
600: * matvec by operating with "columns" of the chunk. */
601: else {
602: for (j=0; j<nz; j++) {
603: for (i=0; i<isize; i++) {
604: ipos = ip[i] + j;
605: yp[i] += aa[ipos] * x[aj[ipos]];
606: }
607: }
608: }
610: #if defined(PETSC_HAVE_CRAY_VECTOR)
611: #pragma _CRI ivdep
612: #endif
613: /* Put results from yp[] into non-permuted result vector y. */
614: for (i=0; i<isize; i++) {
615: y[iperm[istart+i]] = yp[i];
616: }
617: } /* End processing chunk of isize rows of a group. */
619: } /* End handling matvec for chunk with nz > 1. */
620: } /* End loop over igroup. */
622: #endif
623: PetscLogFlops(2.0*a->nz);
624: VecRestoreArrayRead(xx,&x);
625: VecRestoreArrayPair(yy,ww,&y,&w);
626: return(0);
627: }
629: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
630: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
631: * routine, but can also be used to convert an assembled SeqAIJ matrix
632: * into a SeqAIJPERM one. */
633: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
634: {
636: Mat B = *newmat;
637: Mat_SeqAIJPERM *aijperm;
638: PetscBool sametype;
641: if (reuse == MAT_INITIAL_MATRIX) {
642: MatDuplicate(A,MAT_COPY_VALUES,&B);
643: }
644: PetscObjectTypeCompare((PetscObject)A,type,&sametype);
645: if (sametype) return(0);
647: PetscNewLog(B,&aijperm);
648: B->spptr = (void*) aijperm;
650: /* Set function pointers for methods that we inherit from AIJ but override. */
651: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
652: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
653: B->ops->destroy = MatDestroy_SeqAIJPERM;
654: B->ops->mult = MatMult_SeqAIJPERM;
655: B->ops->multadd = MatMultAdd_SeqAIJPERM;
657: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
658: /* If A has already been assembled, compute the permutation. */
659: if (A->assembled) {
660: MatSeqAIJPERM_create_perm(B);
661: }
663: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);
665: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);
666: *newmat = B;
667: return(0);
668: }
670: /*@C
671: MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
672: This type inherits from AIJ, but calculates some additional permutation
673: information that is used to allow better vectorization of some
674: operations. At the cost of increased storage, the AIJ formatted
675: matrix can be copied to a format in which pieces of the matrix are
676: stored in ELLPACK format, allowing the vectorized matrix multiply
677: routine to use stride-1 memory accesses. As with the AIJ type, it is
678: important to preallocate matrix storage in order to get good assembly
679: performance.
681: Collective
683: Input Parameters:
684: + comm - MPI communicator, set to PETSC_COMM_SELF
685: . m - number of rows
686: . n - number of columns
687: . nz - number of nonzeros per row (same for all rows)
688: - nnz - array containing the number of nonzeros in the various rows
689: (possibly different for each row) or NULL
691: Output Parameter:
692: . A - the matrix
694: Notes:
695: If nnz is given then nz is ignored
697: Level: intermediate
699: .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
700: @*/
701: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
702: {
706: MatCreate(comm,A);
707: MatSetSizes(*A,m,n,m,n);
708: MatSetType(*A,MATSEQAIJPERM);
709: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
710: return(0);
711: }
713: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
714: {
718: MatSetType(A,MATSEQAIJ);
719: MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);
720: return(0);
721: }