Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 13:   IS                isrow=a->row;
 14:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 15:   PetscErrorCode    ierr;
 16:   const PetscInt    *r;
 17:   PetscInt          nz,*vj,k,idx,k1;
 18:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 19:   const MatScalar   *aa=a->a,*v,*diag;
 20:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 21:   const PetscScalar *b;

 24:   VecGetArrayRead(bb,&b);
 25:   VecGetArray(xx,&x);
 26:   t    = a->solve_work;
 27:   ISGetIndices(isrow,&r);
 28:   PetscMalloc1(bs,&xk_tmp);

 30:   /* solve U^T * D * y = b by forward substitution */
 31:   xk = t;
 32:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 33:     idx = bs*r[k];
 34:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 35:   }
 36:   for (k=0; k<mbs; k++) {
 37:     v    = aa + bs2*ai[k];
 38:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 39:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 40:     nz   = ai[k+1] - ai[k];
 41:     vj   = aj + ai[k];
 42:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 43:     while (nz--) {
 44:       /* x(:) += U(k,:)^T*(Dk*xk) */
 45:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 46:       vj++; xj = t + (*vj)*bs;
 47:       v       += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k=mbs-1; k>=0; k--) {
 56:     v  = aa + bs2*ai[k];
 57:     xk = t + k*bs;        /* xk */
 58:     nz = ai[k+1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj)*bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2; xj = t + (*vj)*bs;
 66:     }
 67:     idx = bs*r[k];
 68:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow,&r);
 73:   VecRestoreArrayRead(bb,&b);
 74:   VecRestoreArray(xx,&x);
 75:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 76:   return(0);
 77: }

 79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 80: {
 82:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
 83: }

 85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 86: {
 88:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
 89: }

 91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
 92: {
 93:   PetscErrorCode  ierr;
 94:   PetscInt        nz,k;
 95:   const PetscInt  *vj,bs2 = bs*bs;
 96:   const MatScalar *v,*diag;
 97:   PetscScalar     *xk,*xj,*xk_tmp;

100:   PetscMalloc1(bs,&xk_tmp);
101:   for (k=0; k<mbs; k++) {
102:     v    = aa + bs2*ai[k];
103:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
104:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105:     nz   = ai[k+1] - ai[k];
106:     vj   = aj + ai[k];
107:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108:     while (nz--) {
109:       /* x(:) += U(k,:)^T*(Dk*xk) */
110:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111:       vj++; xj = x + (*vj)*bs;
112:       v       += bs2;
113:     }
114:     /* xk = inv(Dk)*(Dk*xk) */
115:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
116:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117:   }
118:   PetscFree(xk_tmp);
119:   return(0);
120: }

122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124:   PetscInt        nz,k;
125:   const PetscInt  *vj,bs2 = bs*bs;
126:   const MatScalar *v;
127:   PetscScalar     *xk,*xj;

130:   for (k=mbs-1; k>=0; k--) {
131:     v  = aa + bs2*ai[k];
132:     xk = x + k*bs;        /* xk */
133:     nz = ai[k+1] - ai[k];
134:     vj = aj + ai[k];
135:     xj = x + (*vj)*bs;
136:     while (nz--) {
137:       /* xk += U(k,:)*x(:) */
138:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139:       vj++;
140:       v += bs2; xj = x + (*vj)*bs;
141:     }
142:   }
143:   return(0);
144: }

146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
149:   PetscErrorCode    ierr;
150:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
151:   PetscInt          bs =A->rmap->bs;
152:   const MatScalar   *aa=a->a;
153:   PetscScalar       *x;
154:   const PetscScalar *b;

157:   VecGetArrayRead(bb,&b);
158:   VecGetArray(xx,&x);

160:   /* solve U^T * D * y = b by forward substitution */
161:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
162:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

164:   /* solve U*x = y by back substitution */
165:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

167:   VecRestoreArrayRead(bb,&b);
168:   VecRestoreArray(xx,&x);
169:   PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170:   return(0);
171: }

173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
176:   PetscErrorCode    ierr;
177:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
178:   PetscInt          bs =A->rmap->bs;
179:   const MatScalar   *aa=a->a;
180:   const PetscScalar *b;
181:   PetscScalar       *x;

184:   VecGetArrayRead(bb,&b);
185:   VecGetArray(xx,&x);
186:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
187:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188:   VecRestoreArrayRead(bb,&b);
189:   VecRestoreArray(xx,&x);
190:   PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191:   return(0);
192: }

194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
197:   PetscErrorCode    ierr;
198:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
199:   PetscInt          bs =A->rmap->bs;
200:   const MatScalar   *aa=a->a;
201:   const PetscScalar *b;
202:   PetscScalar       *x;

205:   VecGetArrayRead(bb,&b);
206:   VecGetArray(xx,&x);
207:   PetscArraycpy(x,b,bs*mbs);
208:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209:   VecRestoreArrayRead(bb,&b);
210:   VecRestoreArray(xx,&x);
211:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212:   return(0);
213: }

215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
218:   IS                isrow=a->row;
219:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220:   PetscErrorCode    ierr;
221:   PetscInt          nz,k,idx;
222:   const MatScalar   *aa=a->a,*v,*d;
223:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224:   const PetscScalar *b;

227:   VecGetArrayRead(bb,&b);
228:   VecGetArray(xx,&x);
229:   t    = a->solve_work;
230:   ISGetIndices(isrow,&r);

232:   /* solve U^T * D * y = b by forward substitution */
233:   tp = t;
234:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
235:     idx   = 7*r[k];
236:     tp[0] = b[idx];
237:     tp[1] = b[idx+1];
238:     tp[2] = b[idx+2];
239:     tp[3] = b[idx+3];
240:     tp[4] = b[idx+4];
241:     tp[5] = b[idx+5];
242:     tp[6] = b[idx+6];
243:     tp   += 7;
244:   }

246:   for (k=0; k<mbs; k++) {
247:     v  = aa + 49*ai[k];
248:     vj = aj + ai[k];
249:     tp = t + k*7;
250:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251:     nz = ai[k+1] - ai[k];
252:     tp = t + (*vj)*7;
253:     while (nz--) {
254:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261:       vj++;
262:       tp = t + (*vj)*7;
263:       v += 49;
264:     }

266:     /* xk = inv(Dk)*(Dk*xk) */
267:     d     = aa+k*49;       /* ptr to inv(Dk) */
268:     tp    = t + k*7;
269:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276:   }

278:   /* solve U*x = y by back substitution */
279:   for (k=mbs-1; k>=0; k--) {
280:     v  = aa + 49*ai[k];
281:     vj = aj + ai[k];
282:     tp = t + k*7;
283:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
284:     nz = ai[k+1] - ai[k];

286:     tp = t + (*vj)*7;
287:     while (nz--) {
288:       /* xk += U(k,:)*x(:) */
289:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296:       vj++;
297:       tp = t + (*vj)*7;
298:       v += 49;
299:     }
300:     tp       = t + k*7;
301:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302:     idx      = 7*r[k];
303:     x[idx]   = x0;
304:     x[idx+1] = x1;
305:     x[idx+2] = x2;
306:     x[idx+3] = x3;
307:     x[idx+4] = x4;
308:     x[idx+5] = x5;
309:     x[idx+6] = x6;
310:   }

312:   ISRestoreIndices(isrow,&r);
313:   VecRestoreArrayRead(bb,&b);
314:   VecRestoreArray(xx,&x);
315:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316:   return(0);
317: }

319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321:   const MatScalar *v,*d;
322:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
323:   PetscInt        nz,k;
324:   const PetscInt  *vj;

327:   for (k=0; k<mbs; k++) {
328:     v  = aa + 49*ai[k];
329:     xp = x + k*7;
330:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331:     nz = ai[k+1] - ai[k];
332:     vj = aj + ai[k];
333:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
334:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
335:     while (nz--) {
336:       xp = x + (*vj)*7;
337:       /* x(:) += U(k,:)^T*(Dk*xk) */
338:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345:       vj++;
346:       v += 49;
347:     }
348:     /* xk = inv(Dk)*(Dk*xk) */
349:     d     = aa+k*49;       /* ptr to inv(Dk) */
350:     xp    = x + k*7;
351:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
352:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
353:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
354:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
355:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
356:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
357:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
358:   }
359:   return(0);
360: }

362: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
363: {
364:   const MatScalar *v;
365:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
366:   PetscInt        nz,k;
367:   const PetscInt  *vj;

370:   for (k=mbs-1; k>=0; k--) {
371:     v  = aa + 49*ai[k];
372:     xp = x + k*7;
373:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
374:     nz = ai[k+1] - ai[k];
375:     vj = aj + ai[k];
376:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
377:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378:     while (nz--) {
379:       xp = x + (*vj)*7;
380:       /* xk += U(k,:)*x(:) */
381:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
382:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
383:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
384:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
385:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
386:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
387:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
388:       vj++;
389:       v += 49;
390:     }
391:     xp = x + k*7;
392:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
393:   }
394:   return(0);
395: }

397: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
398: {
399:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
400:   PetscErrorCode    ierr;
401:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
402:   const MatScalar   *aa=a->a;
403:   PetscScalar       *x;
404:   const PetscScalar *b;

407:   VecGetArrayRead(bb,&b);
408:   VecGetArray(xx,&x);

410:   /* solve U^T * D * y = b by forward substitution */
411:   PetscArraycpy(x,b,7*mbs); /* x <- b */
412:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

414:   /* solve U*x = y by back substitution */
415:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

417:   VecRestoreArrayRead(bb,&b);
418:   VecRestoreArray(xx,&x);
419:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
420:   return(0);
421: }

423: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
424: {
425:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
426:   PetscErrorCode    ierr;
427:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
428:   const MatScalar   *aa=a->a;
429:   PetscScalar       *x;
430:   const PetscScalar *b;

433:   VecGetArrayRead(bb,&b);
434:   VecGetArray(xx,&x);
435:   PetscArraycpy(x,b,7*mbs);
436:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
437:   VecRestoreArrayRead(bb,&b);
438:   VecRestoreArray(xx,&x);
439:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
440:   return(0);
441: }

443: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
444: {
445:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
446:   PetscErrorCode    ierr;
447:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
448:   const MatScalar   *aa=a->a;
449:   PetscScalar       *x;
450:   const PetscScalar *b;

453:   VecGetArrayRead(bb,&b);
454:   VecGetArray(xx,&x);
455:   PetscArraycpy(x,b,7*mbs);
456:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
457:   VecRestoreArrayRead(bb,&b);
458:   VecRestoreArray(xx,&x);
459:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
460:   return(0);
461: }

463: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
464: {
465:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
466:   IS                isrow=a->row;
467:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
468:   PetscErrorCode    ierr;
469:   PetscInt          nz,k,idx;
470:   const MatScalar   *aa=a->a,*v,*d;
471:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
472:   const PetscScalar *b;

475:   VecGetArrayRead(bb,&b);
476:   VecGetArray(xx,&x);
477:   t    = a->solve_work;
478:   ISGetIndices(isrow,&r);

480:   /* solve U^T * D * y = b by forward substitution */
481:   tp = t;
482:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
483:     idx   = 6*r[k];
484:     tp[0] = b[idx];
485:     tp[1] = b[idx+1];
486:     tp[2] = b[idx+2];
487:     tp[3] = b[idx+3];
488:     tp[4] = b[idx+4];
489:     tp[5] = b[idx+5];
490:     tp   += 6;
491:   }

493:   for (k=0; k<mbs; k++) {
494:     v  = aa + 36*ai[k];
495:     vj = aj + ai[k];
496:     tp = t + k*6;
497:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
498:     nz = ai[k+1] - ai[k];
499:     tp = t + (*vj)*6;
500:     while (nz--) {
501:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
502:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
503:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
504:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
505:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
506:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
507:       vj++;
508:       tp = t + (*vj)*6;
509:       v += 36;
510:     }

512:     /* xk = inv(Dk)*(Dk*xk) */
513:     d     = aa+k*36;       /* ptr to inv(Dk) */
514:     tp    = t + k*6;
515:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
516:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
517:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
518:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
519:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
520:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
521:   }

523:   /* solve U*x = y by back substitution */
524:   for (k=mbs-1; k>=0; k--) {
525:     v  = aa + 36*ai[k];
526:     vj = aj + ai[k];
527:     tp = t + k*6;
528:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
529:     nz = ai[k+1] - ai[k];

531:     tp = t + (*vj)*6;
532:     while (nz--) {
533:       /* xk += U(k,:)*x(:) */
534:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
535:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
536:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
537:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
538:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
539:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
540:       vj++;
541:       tp = t + (*vj)*6;
542:       v += 36;
543:     }
544:     tp       = t + k*6;
545:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
546:     idx      = 6*r[k];
547:     x[idx]   = x0;
548:     x[idx+1] = x1;
549:     x[idx+2] = x2;
550:     x[idx+3] = x3;
551:     x[idx+4] = x4;
552:     x[idx+5] = x5;
553:   }

555:   ISRestoreIndices(isrow,&r);
556:   VecRestoreArrayRead(bb,&b);
557:   VecRestoreArray(xx,&x);
558:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
559:   return(0);
560: }

562: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
563: {
564:   const MatScalar *v,*d;
565:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
566:   PetscInt        nz,k;
567:   const PetscInt  *vj;

570:   for (k=0; k<mbs; k++) {
571:     v  = aa + 36*ai[k];
572:     xp = x + k*6;
573:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
574:     nz = ai[k+1] - ai[k];
575:     vj = aj + ai[k];
576:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
577:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
578:     while (nz--) {
579:       xp = x + (*vj)*6;
580:       /* x(:) += U(k,:)^T*(Dk*xk) */
581:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
582:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
583:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
584:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
585:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
586:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
587:       vj++;
588:       v += 36;
589:     }
590:     /* xk = inv(Dk)*(Dk*xk) */
591:     d     = aa+k*36;       /* ptr to inv(Dk) */
592:     xp    = x + k*6;
593:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
594:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
595:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
596:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
597:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
598:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
599:   }
600:   return(0);
601: }
602: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
603: {
604:   const MatScalar   *v;
605:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
606:   PetscInt          nz,k;
607:   const PetscInt    *vj;

610:   for (k=mbs-1; k>=0; k--) {
611:     v  = aa + 36*ai[k];
612:     xp = x + k*6;
613:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
614:     nz = ai[k+1] - ai[k];
615:     vj = aj + ai[k];
616:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
617:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
618:     while (nz--) {
619:       xp = x + (*vj)*6;
620:       /* xk += U(k,:)*x(:) */
621:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
622:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
623:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
624:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
625:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
626:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
627:       vj++;
628:       v += 36;
629:     }
630:     xp   = x + k*6;
631:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
632:   }
633:   return(0);
634: }

636: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
637: {
638:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
639:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
640:   const MatScalar   *aa=a->a;
641:   PetscScalar       *x;
642:   const PetscScalar *b;
643:   PetscErrorCode    ierr;

646:   VecGetArrayRead(bb,&b);
647:   VecGetArray(xx,&x);

649:   /* solve U^T * D * y = b by forward substitution */
650:   PetscArraycpy(x,b,6*mbs); /* x <- b */
651:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

653:   /* solve U*x = y by back substitution */
654:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

656:   VecRestoreArrayRead(bb,&b);
657:   VecRestoreArray(xx,&x);
658:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
659:   return(0);
660: }

662: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
663: {
664:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
665:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
666:   const MatScalar   *aa=a->a;
667:   PetscScalar       *x;
668:   const PetscScalar *b;
669:   PetscErrorCode    ierr;

672:   VecGetArrayRead(bb,&b);
673:   VecGetArray(xx,&x);
674:   PetscArraycpy(x,b,6*mbs); /* x <- b */
675:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
676:   VecRestoreArrayRead(bb,&b);
677:   VecRestoreArray(xx,&x);
678:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
679:   return(0);
680: }

682: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
683: {
684:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
685:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
686:   const MatScalar   *aa=a->a;
687:   PetscScalar       *x;
688:   const PetscScalar *b;
689:   PetscErrorCode    ierr;

692:   VecGetArrayRead(bb,&b);
693:   VecGetArray(xx,&x);
694:   PetscArraycpy(x,b,6*mbs); /* x <- b */
695:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
696:   VecRestoreArrayRead(bb,&b);
697:   VecRestoreArray(xx,&x);
698:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
699:   return(0);
700: }

702: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
703: {
704:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
705:   IS                isrow=a->row;
706:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
707:   PetscErrorCode    ierr;
708:   const PetscInt    *r,*vj;
709:   PetscInt          nz,k,idx;
710:   const MatScalar   *aa=a->a,*v,*diag;
711:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
712:   const PetscScalar *b;

715:   VecGetArrayRead(bb,&b);
716:   VecGetArray(xx,&x);
717:   t    = a->solve_work;
718:   ISGetIndices(isrow,&r);

720:   /* solve U^T * D * y = b by forward substitution */
721:   tp = t;
722:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
723:     idx   = 5*r[k];
724:     tp[0] = b[idx];
725:     tp[1] = b[idx+1];
726:     tp[2] = b[idx+2];
727:     tp[3] = b[idx+3];
728:     tp[4] = b[idx+4];
729:     tp   += 5;
730:   }

732:   for (k=0; k<mbs; k++) {
733:     v  = aa + 25*ai[k];
734:     vj = aj + ai[k];
735:     tp = t + k*5;
736:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
737:     nz = ai[k+1] - ai[k];

739:     tp = t + (*vj)*5;
740:     while (nz--) {
741:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
742:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
743:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
744:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
745:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
746:       vj++;
747:       tp = t + (*vj)*5;
748:       v += 25;
749:     }

751:     /* xk = inv(Dk)*(Dk*xk) */
752:     diag  = aa+k*25;          /* ptr to inv(Dk) */
753:     tp    = t + k*5;
754:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
755:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
756:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
757:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
758:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
759:   }

761:   /* solve U*x = y by back substitution */
762:   for (k=mbs-1; k>=0; k--) {
763:     v  = aa + 25*ai[k];
764:     vj = aj + ai[k];
765:     tp = t + k*5;
766:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
767:     nz = ai[k+1] - ai[k];

769:     tp = t + (*vj)*5;
770:     while (nz--) {
771:       /* xk += U(k,:)*x(:) */
772:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
773:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
774:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
775:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
776:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
777:       vj++;
778:       tp = t + (*vj)*5;
779:       v += 25;
780:     }
781:     tp       = t + k*5;
782:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
783:     idx      = 5*r[k];
784:     x[idx]   = x0;
785:     x[idx+1] = x1;
786:     x[idx+2] = x2;
787:     x[idx+3] = x3;
788:     x[idx+4] = x4;
789:   }

791:   ISRestoreIndices(isrow,&r);
792:   VecRestoreArrayRead(bb,&b);
793:   VecRestoreArray(xx,&x);
794:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
795:   return(0);
796: }

798: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
799: {
800:   const MatScalar *v,*diag;
801:   PetscScalar     *xp,x0,x1,x2,x3,x4;
802:   PetscInt        nz,k;
803:   const PetscInt  *vj;

806:   for (k=0; k<mbs; k++) {
807:     v  = aa + 25*ai[k];
808:     xp = x + k*5;
809:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
810:     nz = ai[k+1] - ai[k];
811:     vj = aj + ai[k];
812:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
813:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
814:     while (nz--) {
815:       xp = x + (*vj)*5;
816:       /* x(:) += U(k,:)^T*(Dk*xk) */
817:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
818:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
819:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
820:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
821:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
822:       vj++;
823:       v += 25;
824:     }
825:     /* xk = inv(Dk)*(Dk*xk) */
826:     diag  = aa+k*25;         /* ptr to inv(Dk) */
827:     xp    = x + k*5;
828:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
829:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
830:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
831:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
832:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
833:   }
834:   return(0);
835: }

837: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
838: {
839:   const MatScalar *v;
840:   PetscScalar     *xp,x0,x1,x2,x3,x4;
841:   PetscInt        nz,k;
842:   const PetscInt  *vj;

845:   for (k=mbs-1; k>=0; k--) {
846:     v  = aa + 25*ai[k];
847:     xp = x + k*5;
848:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
849:     nz = ai[k+1] - ai[k];
850:     vj = aj + ai[k];
851:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
852:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
853:     while (nz--) {
854:       xp = x + (*vj)*5;
855:       /* xk += U(k,:)*x(:) */
856:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
857:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
858:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
859:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
860:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
861:       vj++;
862:       v += 25;
863:     }
864:     xp   = x + k*5;
865:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
866:   }
867:   return(0);
868: }

870: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
871: {
872:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
873:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
874:   const MatScalar   *aa=a->a;
875:   PetscScalar       *x;
876:   const PetscScalar *b;
877:   PetscErrorCode    ierr;

880:   VecGetArrayRead(bb,&b);
881:   VecGetArray(xx,&x);

883:   /* solve U^T * D * y = b by forward substitution */
884:   PetscArraycpy(x,b,5*mbs); /* x <- b */
885:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

887:   /* solve U*x = y by back substitution */
888:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

890:   VecRestoreArrayRead(bb,&b);
891:   VecRestoreArray(xx,&x);
892:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
893:   return(0);
894: }

896: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
897: {
898:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
899:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
900:   const MatScalar   *aa=a->a;
901:   PetscScalar       *x;
902:   const PetscScalar *b;
903:   PetscErrorCode    ierr;

906:   VecGetArrayRead(bb,&b);
907:   VecGetArray(xx,&x);
908:   PetscArraycpy(x,b,5*mbs); /* x <- b */
909:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
910:   VecRestoreArrayRead(bb,&b);
911:   VecRestoreArray(xx,&x);
912:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
913:   return(0);
914: }

916: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
917: {
918:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
919:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
920:   const MatScalar   *aa=a->a;
921:   PetscScalar       *x;
922:   const PetscScalar *b;
923:   PetscErrorCode    ierr;

926:   VecGetArrayRead(bb,&b);
927:   VecGetArray(xx,&x);
928:   PetscArraycpy(x,b,5*mbs);
929:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
930:   VecRestoreArrayRead(bb,&b);
931:   VecRestoreArray(xx,&x);
932:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
933:   return(0);
934: }

936: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
937: {
938:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
939:   IS                isrow=a->row;
940:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
941:   PetscErrorCode    ierr;
942:   const PetscInt    *r,*vj;
943:   PetscInt          nz,k,idx;
944:   const MatScalar   *aa=a->a,*v,*diag;
945:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
946:   const PetscScalar *b;

949:   VecGetArrayRead(bb,&b);
950:   VecGetArray(xx,&x);
951:   t    = a->solve_work;
952:   ISGetIndices(isrow,&r);

954:   /* solve U^T * D * y = b by forward substitution */
955:   tp = t;
956:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
957:     idx   = 4*r[k];
958:     tp[0] = b[idx];
959:     tp[1] = b[idx+1];
960:     tp[2] = b[idx+2];
961:     tp[3] = b[idx+3];
962:     tp   += 4;
963:   }

965:   for (k=0; k<mbs; k++) {
966:     v  = aa + 16*ai[k];
967:     vj = aj + ai[k];
968:     tp = t + k*4;
969:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
970:     nz = ai[k+1] - ai[k];

972:     tp = t + (*vj)*4;
973:     while (nz--) {
974:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
975:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
976:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
977:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
978:       vj++;
979:       tp = t + (*vj)*4;
980:       v += 16;
981:     }

983:     /* xk = inv(Dk)*(Dk*xk) */
984:     diag  = aa+k*16;          /* ptr to inv(Dk) */
985:     tp    = t + k*4;
986:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
987:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
988:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
989:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
990:   }

992:   /* solve U*x = y by back substitution */
993:   for (k=mbs-1; k>=0; k--) {
994:     v  = aa + 16*ai[k];
995:     vj = aj + ai[k];
996:     tp = t + k*4;
997:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
998:     nz = ai[k+1] - ai[k];

1000:     tp = t + (*vj)*4;
1001:     while (nz--) {
1002:       /* xk += U(k,:)*x(:) */
1003:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1004:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1005:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1006:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1007:       vj++;
1008:       tp = t + (*vj)*4;
1009:       v += 16;
1010:     }
1011:     tp       = t + k*4;
1012:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1013:     idx      = 4*r[k];
1014:     x[idx]   = x0;
1015:     x[idx+1] = x1;
1016:     x[idx+2] = x2;
1017:     x[idx+3] = x3;
1018:   }

1020:   ISRestoreIndices(isrow,&r);
1021:   VecRestoreArrayRead(bb,&b);
1022:   VecRestoreArray(xx,&x);
1023:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1024:   return(0);
1025: }

1027: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1028: {
1029:   const MatScalar *v,*diag;
1030:   PetscScalar     *xp,x0,x1,x2,x3;
1031:   PetscInt        nz,k;
1032:   const PetscInt  *vj;

1035:   for (k=0; k<mbs; k++) {
1036:     v  = aa + 16*ai[k];
1037:     xp = x + k*4;
1038:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1039:     nz = ai[k+1] - ai[k];
1040:     vj = aj + ai[k];
1041:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1042:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1043:     while (nz--) {
1044:       xp = x + (*vj)*4;
1045:       /* x(:) += U(k,:)^T*(Dk*xk) */
1046:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1047:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1048:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1049:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1050:       vj++;
1051:       v += 16;
1052:     }
1053:     /* xk = inv(Dk)*(Dk*xk) */
1054:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1055:     xp    = x + k*4;
1056:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1057:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1058:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1059:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1060:   }
1061:   return(0);
1062: }

1064: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1065: {
1066:   const MatScalar *v;
1067:   PetscScalar     *xp,x0,x1,x2,x3;
1068:   PetscInt        nz,k;
1069:   const PetscInt  *vj;

1072:   for (k=mbs-1; k>=0; k--) {
1073:     v  = aa + 16*ai[k];
1074:     xp = x + k*4;
1075:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1076:     nz = ai[k+1] - ai[k];
1077:     vj = aj + ai[k];
1078:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1079:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1080:     while (nz--) {
1081:       xp = x + (*vj)*4;
1082:       /* xk += U(k,:)*x(:) */
1083:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1084:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1085:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1086:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1087:       vj++;
1088:       v += 16;
1089:     }
1090:     xp    = x + k*4;
1091:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1092:   }
1093:   return(0);
1094: }

1096: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1097: {
1098:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1099:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1100:   const MatScalar   *aa=a->a;
1101:   PetscScalar       *x;
1102:   const PetscScalar *b;
1103:   PetscErrorCode    ierr;

1106:   VecGetArrayRead(bb,&b);
1107:   VecGetArray(xx,&x);

1109:   /* solve U^T * D * y = b by forward substitution */
1110:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1111:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1113:   /* solve U*x = y by back substitution */
1114:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1115:   VecRestoreArrayRead(bb,&b);
1116:   VecRestoreArray(xx,&x);
1117:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1118:   return(0);
1119: }

1121: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1122: {
1123:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1124:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1125:   const MatScalar   *aa=a->a;
1126:   PetscScalar       *x;
1127:   const PetscScalar *b;
1128:   PetscErrorCode    ierr;

1131:   VecGetArrayRead(bb,&b);
1132:   VecGetArray(xx,&x);
1133:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1134:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1135:   VecRestoreArrayRead(bb,&b);
1136:   VecRestoreArray(xx,&x);
1137:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1138:   return(0);
1139: }

1141: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1142: {
1143:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1144:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1145:   const MatScalar   *aa=a->a;
1146:   PetscScalar       *x;
1147:   const PetscScalar *b;
1148:   PetscErrorCode    ierr;

1151:   VecGetArrayRead(bb,&b);
1152:   VecGetArray(xx,&x);
1153:   PetscArraycpy(x,b,4*mbs);
1154:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1155:   VecRestoreArrayRead(bb,&b);
1156:   VecRestoreArray(xx,&x);
1157:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1158:   return(0);
1159: }

1161: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1162: {
1163:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1164:   IS                isrow=a->row;
1165:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1166:   PetscErrorCode    ierr;
1167:   const PetscInt    *r;
1168:   PetscInt          nz,k,idx;
1169:   const PetscInt    *vj;
1170:   const MatScalar   *aa=a->a,*v,*diag;
1171:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1172:   const PetscScalar *b;

1175:   VecGetArrayRead(bb,&b);
1176:   VecGetArray(xx,&x);
1177:   t    = a->solve_work;
1178:   ISGetIndices(isrow,&r);

1180:   /* solve U^T * D * y = b by forward substitution */
1181:   tp = t;
1182:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1183:     idx   = 3*r[k];
1184:     tp[0] = b[idx];
1185:     tp[1] = b[idx+1];
1186:     tp[2] = b[idx+2];
1187:     tp   += 3;
1188:   }

1190:   for (k=0; k<mbs; k++) {
1191:     v  = aa + 9*ai[k];
1192:     vj = aj + ai[k];
1193:     tp = t + k*3;
1194:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1195:     nz = ai[k+1] - ai[k];

1197:     tp = t + (*vj)*3;
1198:     while (nz--) {
1199:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1200:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1201:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1202:       vj++;
1203:       tp = t + (*vj)*3;
1204:       v += 9;
1205:     }

1207:     /* xk = inv(Dk)*(Dk*xk) */
1208:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1209:     tp    = t + k*3;
1210:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1211:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1212:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1213:   }

1215:   /* solve U*x = y by back substitution */
1216:   for (k=mbs-1; k>=0; k--) {
1217:     v  = aa + 9*ai[k];
1218:     vj = aj + ai[k];
1219:     tp = t + k*3;
1220:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1221:     nz = ai[k+1] - ai[k];

1223:     tp = t + (*vj)*3;
1224:     while (nz--) {
1225:       /* xk += U(k,:)*x(:) */
1226:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1227:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1228:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1229:       vj++;
1230:       tp = t + (*vj)*3;
1231:       v += 9;
1232:     }
1233:     tp       = t + k*3;
1234:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1235:     idx      = 3*r[k];
1236:     x[idx]   = x0;
1237:     x[idx+1] = x1;
1238:     x[idx+2] = x2;
1239:   }

1241:   ISRestoreIndices(isrow,&r);
1242:   VecRestoreArrayRead(bb,&b);
1243:   VecRestoreArray(xx,&x);
1244:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1245:   return(0);
1246: }

1248: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1249: {
1250:   const MatScalar *v,*diag;
1251:   PetscScalar     *xp,x0,x1,x2;
1252:   PetscInt        nz,k;
1253:   const PetscInt  *vj;

1256:   for (k=0; k<mbs; k++) {
1257:     v  = aa + 9*ai[k];
1258:     xp = x + k*3;
1259:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1260:     nz = ai[k+1] - ai[k];
1261:     vj = aj + ai[k];
1262:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1263:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1264:     while (nz--) {
1265:       xp = x + (*vj)*3;
1266:       /* x(:) += U(k,:)^T*(Dk*xk) */
1267:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1268:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1269:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1270:       vj++;
1271:       v += 9;
1272:     }
1273:     /* xk = inv(Dk)*(Dk*xk) */
1274:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1275:     xp    = x + k*3;
1276:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1277:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1278:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1279:   }
1280:   return(0);
1281: }

1283: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1284: {
1285:   const MatScalar *v;
1286:   PetscScalar     *xp,x0,x1,x2;
1287:   PetscInt        nz,k;
1288:   const PetscInt  *vj;

1291:   for (k=mbs-1; k>=0; k--) {
1292:     v  = aa + 9*ai[k];
1293:     xp = x + k*3;
1294:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1295:     nz = ai[k+1] - ai[k];
1296:     vj = aj + ai[k];
1297:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1298:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299:     while (nz--) {
1300:       xp = x + (*vj)*3;
1301:       /* xk += U(k,:)*x(:) */
1302:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1303:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1304:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1305:       vj++;
1306:       v += 9;
1307:     }
1308:     xp    = x + k*3;
1309:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1310:   }
1311:   return(0);
1312: }

1314: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1315: {
1316:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1317:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1318:   const MatScalar   *aa=a->a;
1319:   PetscScalar       *x;
1320:   const PetscScalar *b;
1321:   PetscErrorCode    ierr;

1324:   VecGetArrayRead(bb,&b);
1325:   VecGetArray(xx,&x);

1327:   /* solve U^T * D * y = b by forward substitution */
1328:   PetscArraycpy(x,b,3*mbs);
1329:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1331:   /* solve U*x = y by back substitution */
1332:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1334:   VecRestoreArrayRead(bb,&b);
1335:   VecRestoreArray(xx,&x);
1336:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1337:   return(0);
1338: }

1340: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1341: {
1342:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1343:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1344:   const MatScalar   *aa=a->a;
1345:   PetscScalar       *x;
1346:   const PetscScalar *b;
1347:   PetscErrorCode    ierr;

1350:   VecGetArrayRead(bb,&b);
1351:   VecGetArray(xx,&x);
1352:   PetscArraycpy(x,b,3*mbs);
1353:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1354:   VecRestoreArrayRead(bb,&b);
1355:   VecRestoreArray(xx,&x);
1356:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1357:   return(0);
1358: }

1360: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1361: {
1362:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1363:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1364:   const MatScalar   *aa=a->a;
1365:   PetscScalar       *x;
1366:   const PetscScalar *b;
1367:   PetscErrorCode    ierr;

1370:   VecGetArrayRead(bb,&b);
1371:   VecGetArray(xx,&x);
1372:   PetscArraycpy(x,b,3*mbs);
1373:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1374:   VecRestoreArrayRead(bb,&b);
1375:   VecRestoreArray(xx,&x);
1376:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1377:   return(0);
1378: }

1380: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1381: {
1382:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1383:   IS                isrow=a->row;
1384:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1385:   PetscErrorCode    ierr;
1386:   const PetscInt    *r,*vj;
1387:   PetscInt          nz,k,k2,idx;
1388:   const MatScalar   *aa=a->a,*v,*diag;
1389:   PetscScalar       *x,x0,x1,*t;
1390:   const PetscScalar *b;

1393:   VecGetArrayRead(bb,&b);
1394:   VecGetArray(xx,&x);
1395:   t    = a->solve_work;
1396:   ISGetIndices(isrow,&r);

1398:   /* solve U^T * D * y = perm(b) by forward substitution */
1399:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1400:     idx      = 2*r[k];
1401:     t[k*2]   = b[idx];
1402:     t[k*2+1] = b[idx+1];
1403:   }
1404:   for (k=0; k<mbs; k++) {
1405:     v  = aa + 4*ai[k];
1406:     vj = aj + ai[k];
1407:     k2 = k*2;
1408:     x0 = t[k2]; x1 = t[k2+1];
1409:     nz = ai[k+1] - ai[k];
1410:     while (nz--) {
1411:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1412:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1413:       vj++; v      += 4;
1414:     }
1415:     diag    = aa+k*4; /* ptr to inv(Dk) */
1416:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1417:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1418:   }

1420:   /* solve U*x = y by back substitution */
1421:   for (k=mbs-1; k>=0; k--) {
1422:     v  = aa + 4*ai[k];
1423:     vj = aj + ai[k];
1424:     k2 = k*2;
1425:     x0 = t[k2]; x1 = t[k2+1];
1426:     nz = ai[k+1] - ai[k];
1427:     while (nz--) {
1428:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1429:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1430:       vj++;
1431:       v += 4;
1432:     }
1433:     t[k2]    = x0;
1434:     t[k2+1]  = x1;
1435:     idx      = 2*r[k];
1436:     x[idx]   = x0;
1437:     x[idx+1] = x1;
1438:   }

1440:   ISRestoreIndices(isrow,&r);
1441:   VecRestoreArrayRead(bb,&b);
1442:   VecRestoreArray(xx,&x);
1443:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1444:   return(0);
1445: }

1447: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1448: {
1449:   const MatScalar *v,*diag;
1450:   PetscScalar     x0,x1;
1451:   PetscInt        nz,k,k2;
1452:   const PetscInt  *vj;

1455:   for (k=0; k<mbs; k++) {
1456:     v  = aa + 4*ai[k];
1457:     vj = aj + ai[k];
1458:     k2 = k*2;
1459:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1460:     nz = ai[k+1] - ai[k];
1461:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1462:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1463:     while (nz--) {
1464:       /* x(:) += U(k,:)^T*(Dk*xk) */
1465:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1466:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1467:       vj++; v      += 4;
1468:     }
1469:     /* xk = inv(Dk)*(Dk*xk) */
1470:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1471:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1472:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1473:   }
1474:   return(0);
1475: }

1477: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1478: {
1479:   const MatScalar *v;
1480:   PetscScalar     x0,x1;
1481:   PetscInt        nz,k,k2;
1482:   const PetscInt  *vj;

1485:   for (k=mbs-1; k>=0; k--) {
1486:     v  = aa + 4*ai[k];
1487:     vj = aj + ai[k];
1488:     k2 = k*2;
1489:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1490:     nz = ai[k+1] - ai[k];
1491:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1492:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1493:     while (nz--) {
1494:       /* xk += U(k,:)*x(:) */
1495:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1496:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1497:       vj++;
1498:       v += 4;
1499:     }
1500:     x[k2]   = x0;
1501:     x[k2+1] = x1;
1502:   }
1503:   return(0);
1504: }

1506: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1507: {
1508:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1509:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1510:   const MatScalar   *aa=a->a;
1511:   PetscScalar       *x;
1512:   const PetscScalar *b;
1513:   PetscErrorCode    ierr;

1516:   VecGetArrayRead(bb,&b);
1517:   VecGetArray(xx,&x);

1519:   /* solve U^T * D * y = b by forward substitution */
1520:   PetscArraycpy(x,b,2*mbs);
1521:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1523:   /* solve U*x = y by back substitution */
1524:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1526:   VecRestoreArrayRead(bb,&b);
1527:   VecRestoreArray(xx,&x);
1528:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1529:   return(0);
1530: }

1532: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1533: {
1534:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1535:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1536:   const MatScalar   *aa=a->a;
1537:   PetscScalar       *x;
1538:   const PetscScalar *b;
1539:   PetscErrorCode    ierr;

1542:   VecGetArrayRead(bb,&b);
1543:   VecGetArray(xx,&x);
1544:   PetscArraycpy(x,b,2*mbs);
1545:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1546:   VecRestoreArrayRead(bb,&b);
1547:   VecRestoreArray(xx,&x);
1548:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1549:   return(0);
1550: }

1552: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1553: {
1554:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1555:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1556:   const MatScalar   *aa=a->a;
1557:   PetscScalar       *x;
1558:   const PetscScalar *b;
1559:   PetscErrorCode    ierr;

1562:   VecGetArrayRead(bb,&b);
1563:   VecGetArray(xx,&x);
1564:   PetscArraycpy(x,b,2*mbs);
1565:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1566:   VecRestoreArrayRead(bb,&b);
1567:   VecRestoreArray(xx,&x);
1568:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1569:   return(0);
1570: }

1572: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1573: {
1574:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1575:   IS                isrow=a->row;
1576:   PetscErrorCode    ierr;
1577:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1578:   const MatScalar   *aa=a->a,*v;
1579:   const PetscScalar *b;
1580:   PetscScalar       *x,xk,*t;
1581:   PetscInt          nz,k,j;

1584:   VecGetArrayRead(bb,&b);
1585:   VecGetArray(xx,&x);
1586:   t    = a->solve_work;
1587:   ISGetIndices(isrow,&rp);

1589:   /* solve U^T*D*y = perm(b) by forward substitution */
1590:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1591:   for (k=0; k<mbs; k++) {
1592:     v  = aa + ai[k];
1593:     vj = aj + ai[k];
1594:     xk = t[k];
1595:     nz = ai[k+1] - ai[k] - 1;
1596:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1597:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1598:   }

1600:   /* solve U*perm(x) = y by back substitution */
1601:   for (k=mbs-1; k>=0; k--) {
1602:     v  = aa + adiag[k] - 1;
1603:     vj = aj + adiag[k] - 1;
1604:     nz = ai[k+1] - ai[k] - 1;
1605:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1606:     x[rp[k]] = t[k];
1607:   }

1609:   ISRestoreIndices(isrow,&rp);
1610:   VecRestoreArrayRead(bb,&b);
1611:   VecRestoreArray(xx,&x);
1612:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1613:   return(0);
1614: }

1616: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1617: {
1618:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1619:   IS                isrow=a->row;
1620:   PetscErrorCode    ierr;
1621:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1622:   const MatScalar   *aa=a->a,*v;
1623:   PetscScalar       *x,xk,*t;
1624:   const PetscScalar *b;
1625:   PetscInt          nz,k;

1628:   VecGetArrayRead(bb,&b);
1629:   VecGetArray(xx,&x);
1630:   t    = a->solve_work;
1631:   ISGetIndices(isrow,&rp);

1633:   /* solve U^T*D*y = perm(b) by forward substitution */
1634:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1635:   for (k=0; k<mbs; k++) {
1636:     v  = aa + ai[k] + 1;
1637:     vj = aj + ai[k] + 1;
1638:     xk = t[k];
1639:     nz = ai[k+1] - ai[k] - 1;
1640:     while (nz--) t[*vj++] += (*v++) * xk;
1641:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1642:   }

1644:   /* solve U*perm(x) = y by back substitution */
1645:   for (k=mbs-1; k>=0; k--) {
1646:     v  = aa + ai[k] + 1;
1647:     vj = aj + ai[k] + 1;
1648:     nz = ai[k+1] - ai[k] - 1;
1649:     while (nz--) t[k] += (*v++) * t[*vj++];
1650:     x[rp[k]] = t[k];
1651:   }

1653:   ISRestoreIndices(isrow,&rp);
1654:   VecRestoreArrayRead(bb,&b);
1655:   VecRestoreArray(xx,&x);
1656:   PetscLogFlops(4.0*a->nz - 3*mbs);
1657:   return(0);
1658: }

1660: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1661: {
1662:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1663:   IS                isrow=a->row;
1664:   PetscErrorCode    ierr;
1665:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1666:   const MatScalar   *aa=a->a,*v;
1667:   PetscReal         diagk;
1668:   PetscScalar       *x,xk;
1669:   const PetscScalar *b;
1670:   PetscInt          nz,k;

1673:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1674:   VecGetArrayRead(bb,&b);
1675:   VecGetArray(xx,&x);
1676:   ISGetIndices(isrow,&rp);

1678:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1679:   for (k=0; k<mbs; k++) {
1680:     v  = aa + ai[k];
1681:     vj = aj + ai[k];
1682:     xk = x[k];
1683:     nz = ai[k+1] - ai[k] - 1;
1684:     while (nz--) x[*vj++] += (*v++) * xk;

1686:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1687:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1688:     x[k] = xk*PetscSqrtReal(diagk);
1689:   }
1690:   ISRestoreIndices(isrow,&rp);
1691:   VecRestoreArrayRead(bb,&b);
1692:   VecRestoreArray(xx,&x);
1693:   PetscLogFlops(2.0*a->nz - mbs);
1694:   return(0);
1695: }

1697: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1698: {
1699:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1700:   IS                isrow=a->row;
1701:   PetscErrorCode    ierr;
1702:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1703:   const MatScalar   *aa=a->a,*v;
1704:   PetscReal         diagk;
1705:   PetscScalar       *x,xk;
1706:   const PetscScalar *b;
1707:   PetscInt          nz,k;

1710:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1711:   VecGetArrayRead(bb,&b);
1712:   VecGetArray(xx,&x);
1713:   ISGetIndices(isrow,&rp);

1715:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1716:   for (k=0; k<mbs; k++) {
1717:     v  = aa + ai[k] + 1;
1718:     vj = aj + ai[k] + 1;
1719:     xk = x[k];
1720:     nz = ai[k+1] - ai[k] - 1;
1721:     while (nz--) x[*vj++] += (*v++) * xk;

1723:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1724:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1725:     x[k] = xk*PetscSqrtReal(diagk);
1726:   }
1727:   ISRestoreIndices(isrow,&rp);
1728:   VecRestoreArrayRead(bb,&b);
1729:   VecRestoreArray(xx,&x);
1730:   PetscLogFlops(2.0*a->nz - mbs);
1731:   return(0);
1732: }

1734: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1735: {
1736:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1737:   IS                isrow=a->row;
1738:   PetscErrorCode    ierr;
1739:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1740:   const MatScalar   *aa=a->a,*v;
1741:   PetscReal         diagk;
1742:   PetscScalar       *x,*t;
1743:   const PetscScalar *b;
1744:   PetscInt          nz,k;

1747:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1748:   VecGetArrayRead(bb,&b);
1749:   VecGetArray(xx,&x);
1750:   t    = a->solve_work;
1751:   ISGetIndices(isrow,&rp);

1753:   for (k=mbs-1; k>=0; k--) {
1754:     v     = aa + ai[k];
1755:     vj    = aj + ai[k];
1756:     diagk = PetscRealPart(aa[adiag[k]]);
1757:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1758:     t[k] = b[k] * PetscSqrtReal(diagk);
1759:     nz   = ai[k+1] - ai[k] - 1;
1760:     while (nz--) t[k] += (*v++) * t[*vj++];
1761:     x[rp[k]] = t[k];
1762:   }
1763:   ISRestoreIndices(isrow,&rp);
1764:   VecRestoreArrayRead(bb,&b);
1765:   VecRestoreArray(xx,&x);
1766:   PetscLogFlops(2.0*a->nz - mbs);
1767:   return(0);
1768: }

1770: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1771: {
1772:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1773:   IS                isrow=a->row;
1774:   PetscErrorCode    ierr;
1775:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1776:   const MatScalar   *aa=a->a,*v;
1777:   PetscReal         diagk;
1778:   PetscScalar       *x,*t;
1779:   const PetscScalar *b;
1780:   PetscInt          nz,k;

1783:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1784:   VecGetArrayRead(bb,&b);
1785:   VecGetArray(xx,&x);
1786:   t    = a->solve_work;
1787:   ISGetIndices(isrow,&rp);

1789:   for (k=mbs-1; k>=0; k--) {
1790:     v     = aa + ai[k] + 1;
1791:     vj    = aj + ai[k] + 1;
1792:     diagk = PetscRealPart(aa[ai[k]]);
1793:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1794:     t[k] = b[k] * PetscSqrtReal(diagk);
1795:     nz   = ai[k+1] - ai[k] - 1;
1796:     while (nz--) t[k] += (*v++) * t[*vj++];
1797:     x[rp[k]] = t[k];
1798:   }
1799:   ISRestoreIndices(isrow,&rp);
1800:   VecRestoreArrayRead(bb,&b);
1801:   VecRestoreArray(xx,&x);
1802:   PetscLogFlops(2.0*a->nz - mbs);
1803:   return(0);
1804: }

1806: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1807: {
1808:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1812:   if (A->rmap->bs == 1) {
1813:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1814:   } else {
1815:     IS                isrow=a->row;
1816:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1817:     const MatScalar   *aa=a->a,*v;
1818:     PetscScalar       *x,*t;
1819:     const PetscScalar *b;
1820:     PetscInt          nz,k,n,i,j;

1822:     if (bb->n > a->solves_work_n) {
1823:       PetscFree(a->solves_work);
1824:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1825:       a->solves_work_n = bb->n;
1826:     }
1827:     n    = bb->n;
1828:     VecGetArrayRead(bb->v,&b);
1829:     VecGetArray(xx->v,&x);
1830:     t    = a->solves_work;

1832:     ISGetIndices(isrow,&rp);

1834:     /* solve U^T*D*y = perm(b) by forward substitution */
1835:     for (k=0; k<mbs; k++) {
1836:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1837:     }
1838:     for (k=0; k<mbs; k++) {
1839:       v  = aa + ai[k];
1840:       vj = aj + ai[k];
1841:       nz = ai[k+1] - ai[k] - 1;
1842:       for (j=0; j<nz; j++) {
1843:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1844:         v++;vj++;
1845:       }
1846:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1847:     }

1849:     /* solve U*perm(x) = y by back substitution */
1850:     for (k=mbs-1; k>=0; k--) {
1851:       v  = aa + ai[k] - 1;
1852:       vj = aj + ai[k] - 1;
1853:       nz = ai[k+1] - ai[k] - 1;
1854:       for (j=0; j<nz; j++) {
1855:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1856:         v++;vj++;
1857:       }
1858:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1859:     }

1861:     ISRestoreIndices(isrow,&rp);
1862:     VecRestoreArrayRead(bb->v,&b);
1863:     VecRestoreArray(xx->v,&x);
1864:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1865:   }
1866:   return(0);
1867: }

1869: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1870: {
1871:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1875:   if (A->rmap->bs == 1) {
1876:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1877:   } else {
1878:     IS                isrow=a->row;
1879:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1880:     const MatScalar   *aa=a->a,*v;
1881:     PetscScalar       *x,*t;
1882:     const PetscScalar *b;
1883:     PetscInt          nz,k,n,i;

1885:     if (bb->n > a->solves_work_n) {
1886:       PetscFree(a->solves_work);
1887:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1888:       a->solves_work_n = bb->n;
1889:     }
1890:     n    = bb->n;
1891:     VecGetArrayRead(bb->v,&b);
1892:     VecGetArray(xx->v,&x);
1893:     t    = a->solves_work;

1895:     ISGetIndices(isrow,&rp);

1897:     /* solve U^T*D*y = perm(b) by forward substitution */
1898:     for (k=0; k<mbs; k++) {
1899:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1900:     }
1901:     for (k=0; k<mbs; k++) {
1902:       v  = aa + ai[k];
1903:       vj = aj + ai[k];
1904:       nz = ai[k+1] - ai[k];
1905:       while (nz--) {
1906:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1907:         v++;vj++;
1908:       }
1909:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1910:     }

1912:     /* solve U*perm(x) = y by back substitution */
1913:     for (k=mbs-1; k>=0; k--) {
1914:       v  = aa + ai[k];
1915:       vj = aj + ai[k];
1916:       nz = ai[k+1] - ai[k];
1917:       while (nz--) {
1918:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1919:         v++;vj++;
1920:       }
1921:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1922:     }

1924:     ISRestoreIndices(isrow,&rp);
1925:     VecRestoreArrayRead(bb->v,&b);
1926:     VecRestoreArray(xx->v,&x);
1927:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1928:   }
1929:   return(0);
1930: }

1932: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1933: {
1934:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1935:   PetscErrorCode    ierr;
1936:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1937:   const MatScalar   *aa=a->a,*v;
1938:   const PetscScalar *b;
1939:   PetscScalar       *x,xi;
1940:   PetscInt          nz,i,j;

1943:   VecGetArrayRead(bb,&b);
1944:   VecGetArray(xx,&x);
1945:   /* solve U^T*D*y = b by forward substitution */
1946:   PetscArraycpy(x,b,mbs);
1947:   for (i=0; i<mbs; i++) {
1948:     v  = aa + ai[i];
1949:     vj = aj + ai[i];
1950:     xi = x[i];
1951:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1952:     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1953:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1954:   }
1955:   /* solve U*x = y by backward substitution */
1956:   for (i=mbs-2; i>=0; i--) {
1957:     xi = x[i];
1958:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1959:     vj = aj + adiag[i] - 1;
1960:     nz = ai[i+1] - ai[i] - 1;
1961:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1962:     x[i] = xi;
1963:   }
1964:   VecRestoreArrayRead(bb,&b);
1965:   VecRestoreArray(xx,&x);
1966:   PetscLogFlops(4.0*a->nz - 3*mbs);
1967:   return(0);
1968: }

1970: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1971: {
1972:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1973:   PetscErrorCode    ierr;
1974:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1975:   const MatScalar   *aa=a->a,*v;
1976:   const PetscScalar *b;
1977:   PetscScalar       *x,xi;
1978:   PetscInt          nz,i,j,neq,ldb,ldx;
1979:   PetscBool         isdense;

1982:   if (!mbs) return(0);
1983:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1984:   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1985:   if (X != B) {
1986:     PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1987:     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1988:   }
1989:   MatDenseGetArrayRead(B,&b);
1990:   MatDenseGetLDA(B,&ldb);
1991:   MatDenseGetArray(X,&x);
1992:   MatDenseGetLDA(X,&ldx);
1993:   for (neq=0; neq<B->cmap->n; neq++) {
1994:     /* solve U^T*D*y = b by forward substitution */
1995:     PetscArraycpy(x,b,mbs);
1996:     for (i=0; i<mbs; i++) {
1997:       v  = aa + ai[i];
1998:       vj = aj + ai[i];
1999:       xi = x[i];
2000:       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2001:       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2002:       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2003:     }
2004:     /* solve U*x = y by backward substitution */
2005:     for (i=mbs-2; i>=0; i--) {
2006:       xi = x[i];
2007:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008:       vj = aj + adiag[i] - 1;
2009:       nz = ai[i+1] - ai[i] - 1;
2010:       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011:       x[i] = xi;
2012:     }
2013:     b += ldb;
2014:     x += ldx;
2015:   }
2016:   MatDenseRestoreArrayRead(B,&b);
2017:   MatDenseRestoreArray(X,&x);
2018:   PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2019:   return(0);
2020: }

2022: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023: {
2024:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2025:   PetscErrorCode    ierr;
2026:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2027:   const MatScalar   *aa=a->a,*v;
2028:   PetscScalar       *x,xk;
2029:   const PetscScalar *b;
2030:   PetscInt          nz,k;

2033:   VecGetArrayRead(bb,&b);
2034:   VecGetArray(xx,&x);

2036:   /* solve U^T*D*y = b by forward substitution */
2037:   PetscArraycpy(x,b,mbs);
2038:   for (k=0; k<mbs; k++) {
2039:     v  = aa + ai[k] + 1;
2040:     vj = aj + ai[k] + 1;
2041:     xk = x[k];
2042:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2043:     while (nz--) x[*vj++] += (*v++) * xk;
2044:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2045:   }

2047:   /* solve U*x = y by back substitution */
2048:   for (k=mbs-2; k>=0; k--) {
2049:     v  = aa + ai[k] + 1;
2050:     vj = aj + ai[k] + 1;
2051:     xk = x[k];
2052:     nz = ai[k+1] - ai[k] - 1;
2053:     while (nz--) {
2054:       xk += (*v++) * x[*vj++];
2055:     }
2056:     x[k] = xk;
2057:   }

2059:   VecRestoreArrayRead(bb,&b);
2060:   VecRestoreArray(xx,&x);
2061:   PetscLogFlops(4.0*a->nz - 3*mbs);
2062:   return(0);
2063: }

2065: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2066: {
2067:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2068:   PetscErrorCode    ierr;
2069:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2070:   const MatScalar   *aa=a->a,*v;
2071:   PetscReal         diagk;
2072:   PetscScalar       *x;
2073:   const PetscScalar *b;
2074:   PetscInt          nz,k;

2077:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2078:   VecGetArrayRead(bb,&b);
2079:   VecGetArray(xx,&x);
2080:   PetscArraycpy(x,b,mbs);
2081:   for (k=0; k<mbs; k++) {
2082:     v  = aa + ai[k];
2083:     vj = aj + ai[k];
2084:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2085:     while (nz--) x[*vj++] += (*v++) * x[k];
2086:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088:     x[k] *= PetscSqrtReal(diagk);
2089:   }
2090:   VecRestoreArrayRead(bb,&b);
2091:   VecRestoreArray(xx,&x);
2092:   PetscLogFlops(2.0*a->nz - mbs);
2093:   return(0);
2094: }

2096: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2097: {
2098:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2099:   PetscErrorCode    ierr;
2100:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2101:   const MatScalar   *aa=a->a,*v;
2102:   PetscReal         diagk;
2103:   PetscScalar       *x;
2104:   const PetscScalar *b;
2105:   PetscInt          nz,k;

2108:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2109:   VecGetArrayRead(bb,&b);
2110:   VecGetArray(xx,&x);
2111:   PetscArraycpy(x,b,mbs);
2112:   for (k=0; k<mbs; k++) {
2113:     v  = aa + ai[k] + 1;
2114:     vj = aj + ai[k] + 1;
2115:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2116:     while (nz--) x[*vj++] += (*v++) * x[k];
2117:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2118:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2119:     x[k] *= PetscSqrtReal(diagk);
2120:   }
2121:   VecRestoreArrayRead(bb,&b);
2122:   VecRestoreArray(xx,&x);
2123:   PetscLogFlops(2.0*a->nz - mbs);
2124:   return(0);
2125: }

2127: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2128: {
2129:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2130:   PetscErrorCode    ierr;
2131:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2132:   const MatScalar   *aa=a->a,*v;
2133:   PetscReal         diagk;
2134:   PetscScalar       *x;
2135:   const PetscScalar *b;
2136:   PetscInt          nz,k;

2139:   /* solve D^(1/2)*U*x = b by back substitution */
2140:   VecGetArrayRead(bb,&b);
2141:   VecGetArray(xx,&x);

2143:   for (k=mbs-1; k>=0; k--) {
2144:     v     = aa + ai[k];
2145:     vj    = aj + ai[k];
2146:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2147:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2148:     x[k] = PetscSqrtReal(diagk)*b[k];
2149:     nz   = ai[k+1] - ai[k] - 1;
2150:     while (nz--) x[k] += (*v++) * x[*vj++];
2151:   }
2152:   VecRestoreArrayRead(bb,&b);
2153:   VecRestoreArray(xx,&x);
2154:   PetscLogFlops(2.0*a->nz - mbs);
2155:   return(0);
2156: }

2158: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2159: {
2160:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2161:   PetscErrorCode    ierr;
2162:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2163:   const MatScalar   *aa=a->a,*v;
2164:   PetscReal         diagk;
2165:   PetscScalar       *x;
2166:   const PetscScalar *b;
2167:   PetscInt          nz,k;

2170:   /* solve D^(1/2)*U*x = b by back substitution */
2171:   VecGetArrayRead(bb,&b);
2172:   VecGetArray(xx,&x);

2174:   for (k=mbs-1; k>=0; k--) {
2175:     v     = aa + ai[k] + 1;
2176:     vj    = aj + ai[k] + 1;
2177:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2178:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2179:     x[k] = PetscSqrtReal(diagk)*b[k];
2180:     nz   = ai[k+1] - ai[k] - 1;
2181:     while (nz--) x[k] += (*v++) * x[*vj++];
2182:   }
2183:   VecRestoreArrayRead(bb,&b);
2184:   VecRestoreArray(xx,&x);
2185:   PetscLogFlops(2.0*a->nz - mbs);
2186:   return(0);
2187: }

2189: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2190: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2191: {
2192:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2194:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2195:   PetscInt       *jutmp,bs = A->rmap->bs,i;
2196:   PetscInt       m,reallocs = 0,*levtmp;
2197:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2198:   PetscInt       incrlev,*lev,shift,prow,nz;
2199:   PetscReal      f = info->fill,levels = info->levels;
2200:   PetscBool      perm_identity;

2203:   /* check whether perm is the identity mapping */
2204:   ISIdentity(perm,&perm_identity);

2206:   if (perm_identity) {
2207:     a->permute = PETSC_FALSE;
2208:     ai         = a->i; aj = a->j;
2209:   } else { /*  non-trivial permutation */
2210:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2211:   }

2213:   /* initialization */
2214:   ISGetIndices(perm,&rip);
2215:   umax  = (PetscInt)(f*ai[mbs] + 1);
2216:   PetscMalloc1(umax,&lev);
2217:   umax += mbs + 1;
2218:   shift = mbs + 1;
2219:   PetscMalloc1(mbs+1,&iu);
2220:   PetscMalloc1(umax,&ju);
2221:   iu[0] = mbs + 1;
2222:   juidx = mbs + 1;
2223:   /* prowl: linked list for pivot row */
2224:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2225:   /* q: linked list for col index */

2227:   for (i=0; i<mbs; i++) {
2228:     prowl[i] = mbs;
2229:     q[i]     = 0;
2230:   }

2232:   /* for each row k */
2233:   for (k=0; k<mbs; k++) {
2234:     nzk  = 0;
2235:     q[k] = mbs;
2236:     /* copy current row into linked list */
2237:     nz = ai[rip[k]+1] - ai[rip[k]];
2238:     j  = ai[rip[k]];
2239:     while (nz--) {
2240:       vj = rip[aj[j++]];
2241:       if (vj > k) {
2242:         qm = k;
2243:         do {
2244:           m = qm; qm = q[m];
2245:         } while (qm < vj);
2246:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2247:         nzk++;
2248:         q[m]       = vj;
2249:         q[vj]      = qm;
2250:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2251:       }
2252:     }

2254:     /* modify nonzero structure of k-th row by computing fill-in
2255:        for each row prow to be merged in */
2256:     prow = k;
2257:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2259:     while (prow < k) {
2260:       /* merge row prow into k-th row */
2261:       jmin = iu[prow] + 1;
2262:       jmax = iu[prow+1];
2263:       qm   = k;
2264:       for (j=jmin; j<jmax; j++) {
2265:         incrlev = lev[j-shift] + 1;
2266:         if (incrlev > levels) continue;
2267:         vj = ju[j];
2268:         do {
2269:           m = qm; qm = q[m];
2270:         } while (qm < vj);
2271:         if (qm != vj) {      /* a fill */
2272:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2273:           levtmp[vj] = incrlev;
2274:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2275:       }
2276:       prow = prowl[prow]; /* next pivot row */
2277:     }

2279:     /* add k to row list for first nonzero element in k-th row */
2280:     if (nzk > 1) {
2281:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2282:       prowl[k] = prowl[i]; prowl[i] = k;
2283:     }
2284:     iu[k+1] = iu[k] + nzk;

2286:     /* allocate more space to ju and lev if needed */
2287:     if (iu[k+1] > umax) {
2288:       /* estimate how much additional space we will need */
2289:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2290:       /* just double the memory each time */
2291:       maxadd = umax;
2292:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2293:       umax += maxadd;

2295:       /* allocate a longer ju */
2296:       PetscMalloc1(umax,&jutmp);
2297:       PetscArraycpy(jutmp,ju,iu[k]);
2298:       PetscFree(ju);
2299:       ju   = jutmp;

2301:       PetscMalloc1(umax,&jutmp);
2302:       PetscArraycpy(jutmp,lev,iu[k]-shift);
2303:       PetscFree(lev);
2304:       lev       = jutmp;
2305:       reallocs += 2; /* count how many times we realloc */
2306:     }

2308:     /* save nonzero structure of k-th row in ju */
2309:     i=k;
2310:     while (nzk--) {
2311:       i                = q[i];
2312:       ju[juidx]        = i;
2313:       lev[juidx-shift] = levtmp[i];
2314:       juidx++;
2315:     }
2316:   }

2318: #if defined(PETSC_USE_INFO)
2319:   if (ai[mbs] != 0) {
2320:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2321:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2322:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2323:     PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2324:     PetscInfo(A,"for best performance.\n");
2325:   } else {
2326:     PetscInfo(A,"Empty matrix\n");
2327:   }
2328: #endif

2330:   ISRestoreIndices(perm,&rip);
2331:   PetscFree3(prowl,q,levtmp);
2332:   PetscFree(lev);

2334:   /* put together the new matrix */
2335:   MatSeqSBAIJSetPreallocation(B,bs,0,NULL);

2337:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2338:   b    = (Mat_SeqSBAIJ*)(B)->data;
2339:   PetscFree2(b->imax,b->ilen);

2341:   b->singlemalloc = PETSC_FALSE;
2342:   b->free_a       = PETSC_TRUE;
2343:   b->free_ij      = PETSC_TRUE;
2344:   /* the next line frees the default space generated by the Create() */
2345:   PetscFree3(b->a,b->j,b->i);
2346:   PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2347:   b->j    = ju;
2348:   b->i    = iu;
2349:   b->diag = NULL;
2350:   b->ilen = NULL;
2351:   b->imax = NULL;

2353:   ISDestroy(&b->row);
2354:   ISDestroy(&b->icol);
2355:   b->row  = perm;
2356:   b->icol = perm;
2357:   PetscObjectReference((PetscObject)perm);
2358:   PetscObjectReference((PetscObject)perm);
2359:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2360:   /* In b structure:  Free imax, ilen, old a, old j.
2361:      Allocate idnew, solve_work, new a, new j */
2362:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2363:   b->maxnz = b->nz = iu[mbs];

2365:   (B)->info.factor_mallocs   = reallocs;
2366:   (B)->info.fill_ratio_given = f;
2367:   if (ai[mbs] != 0) {
2368:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2369:   } else {
2370:     (B)->info.fill_ratio_needed = 0.0;
2371:   }
2372:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2373:   return(0);
2374: }

2376: /*
2377:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2378: */
2379: #include <petscbt.h>
2380: #include <../src/mat/utils/freespace.h>
2381: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2382: {
2383:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2384:   PetscErrorCode     ierr;
2385:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2386:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2387:   const PetscInt     *rip;
2388:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2389:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2390:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2391:   PetscReal          fill          =info->fill,levels=info->levels;
2392:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2393:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2394:   PetscBT            lnkbt;

2397:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2398:   MatMissingDiagonal(A,&missing,&d);
2399:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2400:   if (bs > 1) {
2401:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2402:     return(0);
2403:   }

2405:   /* check whether perm is the identity mapping */
2406:   ISIdentity(perm,&perm_identity);
2407:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2408:   a->permute = PETSC_FALSE;

2410:   PetscMalloc1(am+1,&ui);
2411:   PetscMalloc1(am+1,&udiag);
2412:   ui[0] = 0;

2414:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2415:   if (!levels) {
2416:     /* reuse the column pointers and row offsets for memory savings */
2417:     for (i=0; i<am; i++) {
2418:       ncols    = ai[i+1] - ai[i];
2419:       ui[i+1]  = ui[i] + ncols;
2420:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2421:     }
2422:     PetscMalloc1(ui[am]+1,&uj);
2423:     cols = uj;
2424:     for (i=0; i<am; i++) {
2425:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2426:       ncols = ai[i+1] - ai[i] -1;
2427:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2428:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2429:     }
2430:   } else { /* case: levels>0 */
2431:     ISGetIndices(perm,&rip);

2433:     /* initialization */
2434:     /* jl: linked list for storing indices of the pivot rows
2435:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2436:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2437:     for (i=0; i<am; i++) {
2438:       jl[i] = am; il[i] = 0;
2439:     }

2441:     /* create and initialize a linked list for storing column indices of the active row k */
2442:     nlnk = am + 1;
2443:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2445:     /* initial FreeSpace size is fill*(ai[am]+1) */
2446:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2448:     current_space = free_space;

2450:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2452:     current_space_lvl = free_space_lvl;

2454:     for (k=0; k<am; k++) {  /* for each active row k */
2455:       /* initialize lnk by the column indices of row k */
2456:       nzk   = 0;
2457:       ncols = ai[k+1] - ai[k];
2458:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2459:       cols = aj+ai[k];
2460:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2461:       nzk += nlnk;

2463:       /* update lnk by computing fill-in for each pivot row to be merged in */
2464:       prow = jl[k]; /* 1st pivot row */

2466:       while (prow < k) {
2467:         nextprow = jl[prow];

2469:         /* merge prow into k-th row */
2470:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2471:         jmax  = ui[prow+1];
2472:         ncols = jmax-jmin;
2473:         i     = jmin - ui[prow];

2475:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2476:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2477:         j    = *(uj - 1);
2478:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2479:         nzk += nlnk;

2481:         /* update il and jl for prow */
2482:         if (jmin < jmax) {
2483:           il[prow] = jmin;

2485:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2486:         }
2487:         prow = nextprow;
2488:       }

2490:       /* if free space is not available, make more free space */
2491:       if (current_space->local_remaining<nzk) {
2492:         i    = am - k + 1; /* num of unfactored rows */
2493:         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2494:         PetscFreeSpaceGet(i,&current_space);
2495:         PetscFreeSpaceGet(i,&current_space_lvl);
2496:         reallocs++;
2497:       }

2499:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2500:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2501:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2503:       /* add the k-th row into il and jl */
2504:       if (nzk > 1) {
2505:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2506:         jl[k] = jl[i]; jl[i] = k;
2507:         il[k] = ui[k] + 1;
2508:       }
2509:       uj_ptr[k]     = current_space->array;
2510:       uj_lvl_ptr[k] = current_space_lvl->array;

2512:       current_space->array               += nzk;
2513:       current_space->local_used          += nzk;
2514:       current_space->local_remaining     -= nzk;
2515:       current_space_lvl->array           += nzk;
2516:       current_space_lvl->local_used      += nzk;
2517:       current_space_lvl->local_remaining -= nzk;

2519:       ui[k+1] = ui[k] + nzk;
2520:     }

2522:     ISRestoreIndices(perm,&rip);
2523:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2525:     /* destroy list of free space and other temporary array(s) */
2526:     PetscMalloc1(ui[am]+1,&uj);
2527:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2528:     PetscIncompleteLLDestroy(lnk,lnkbt);
2529:     PetscFreeSpaceDestroy(free_space_lvl);

2531:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2533:   /* put together the new matrix in MATSEQSBAIJ format */
2534:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2536:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2537:   PetscFree2(b->imax,b->ilen);

2539:   b->singlemalloc = PETSC_FALSE;
2540:   b->free_a       = PETSC_TRUE;
2541:   b->free_ij      = free_ij;

2543:   PetscMalloc1(ui[am]+1,&b->a);

2545:   b->j         = uj;
2546:   b->i         = ui;
2547:   b->diag      = udiag;
2548:   b->free_diag = PETSC_TRUE;
2549:   b->ilen      = NULL;
2550:   b->imax      = NULL;
2551:   b->row       = perm;
2552:   b->col       = perm;

2554:   PetscObjectReference((PetscObject)perm);
2555:   PetscObjectReference((PetscObject)perm);

2557:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2559:   PetscMalloc1(am+1,&b->solve_work);
2560:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2562:   b->maxnz = b->nz = ui[am];

2564:   fact->info.factor_mallocs   = reallocs;
2565:   fact->info.fill_ratio_given = fill;
2566:   if (ai[am] != 0) {
2567:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2568:   } else {
2569:     fact->info.fill_ratio_needed = 0.0;
2570:   }
2571: #if defined(PETSC_USE_INFO)
2572:   if (ai[am] != 0) {
2573:     PetscReal af = fact->info.fill_ratio_needed;
2574:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2575:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2576:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2577:   } else {
2578:     PetscInfo(A,"Empty matrix\n");
2579:   }
2580: #endif
2581:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2582:   return(0);
2583: }

2585: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2586: {
2587:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2588:   Mat_SeqSBAIJ       *b;
2589:   PetscErrorCode     ierr;
2590:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2591:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2592:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2593:   PetscInt           reallocs=0,i,*ui;
2594:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2595:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2596:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2597:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2598:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2599:   PetscBT            lnkbt;

2602:   /*
2603:    This code originally uses Modified Sparse Row (MSR) storage
2604:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2605:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2606:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2607:    thus the original code in MSR format is still used for these cases.
2608:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2609:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2610:   */
2611:   if (bs > 1) {
2612:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2613:     return(0);
2614:   }

2616:   /* check whether perm is the identity mapping */
2617:   ISIdentity(perm,&perm_identity);
2618:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2619:   a->permute = PETSC_FALSE;

2621:   /* special case that simply copies fill pattern */
2622:   if (!levels) {
2623:     /* reuse the column pointers and row offsets for memory savings */
2624:     ui           = a->i;
2625:     uj           = a->j;
2626:     free_ij      = PETSC_FALSE;
2627:     ratio_needed = 1.0;
2628:   } else { /* case: levels>0 */
2629:     ISGetIndices(perm,&rip);

2631:     /* initialization */
2632:     PetscMalloc1(am+1,&ui);
2633:     ui[0] = 0;

2635:     /* jl: linked list for storing indices of the pivot rows
2636:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2637:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2638:     for (i=0; i<am; i++) {
2639:       jl[i] = am; il[i] = 0;
2640:     }

2642:     /* create and initialize a linked list for storing column indices of the active row k */
2643:     nlnk = am + 1;
2644:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2646:     /* initial FreeSpace size is fill*(ai[am]+1) */
2647:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2649:     current_space = free_space;

2651:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2653:     current_space_lvl = free_space_lvl;

2655:     for (k=0; k<am; k++) {  /* for each active row k */
2656:       /* initialize lnk by the column indices of row rip[k] */
2657:       nzk   = 0;
2658:       ncols = ai[rip[k]+1] - ai[rip[k]];
2659:       cols  = aj+ai[rip[k]];
2660:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2661:       nzk  += nlnk;

2663:       /* update lnk by computing fill-in for each pivot row to be merged in */
2664:       prow = jl[k]; /* 1st pivot row */

2666:       while (prow < k) {
2667:         nextprow = jl[prow];

2669:         /* merge prow into k-th row */
2670:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2671:         jmax     = ui[prow+1];
2672:         ncols    = jmax-jmin;
2673:         i        = jmin - ui[prow];
2674:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2675:         j        = *(uj_lvl_ptr[prow] + i - 1);
2676:         cols_lvl = uj_lvl_ptr[prow]+i;
2677:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2678:         nzk     += nlnk;

2680:         /* update il and jl for prow */
2681:         if (jmin < jmax) {
2682:           il[prow] = jmin;
2683:           j        = *cols;
2684:           jl[prow] = jl[j];
2685:           jl[j]    = prow;
2686:         }
2687:         prow = nextprow;
2688:       }

2690:       /* if free space is not available, make more free space */
2691:       if (current_space->local_remaining<nzk) {
2692:         i    = am - k + 1; /* num of unfactored rows */
2693:         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2694:         PetscFreeSpaceGet(i,&current_space);
2695:         PetscFreeSpaceGet(i,&current_space_lvl);
2696:         reallocs++;
2697:       }

2699:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2700:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2702:       /* add the k-th row into il and jl */
2703:       if (nzk-1 > 0) {
2704:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2705:         jl[k] = jl[i]; jl[i] = k;
2706:         il[k] = ui[k] + 1;
2707:       }
2708:       uj_ptr[k]     = current_space->array;
2709:       uj_lvl_ptr[k] = current_space_lvl->array;

2711:       current_space->array               += nzk;
2712:       current_space->local_used          += nzk;
2713:       current_space->local_remaining     -= nzk;
2714:       current_space_lvl->array           += nzk;
2715:       current_space_lvl->local_used      += nzk;
2716:       current_space_lvl->local_remaining -= nzk;

2718:       ui[k+1] = ui[k] + nzk;
2719:     }

2721:     ISRestoreIndices(perm,&rip);
2722:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2724:     /* destroy list of free space and other temporary array(s) */
2725:     PetscMalloc1(ui[am]+1,&uj);
2726:     PetscFreeSpaceContiguous(&free_space,uj);
2727:     PetscIncompleteLLDestroy(lnk,lnkbt);
2728:     PetscFreeSpaceDestroy(free_space_lvl);
2729:     if (ai[am] != 0) {
2730:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731:     } else {
2732:       ratio_needed = 0.0;
2733:     }
2734:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2736:   /* put together the new matrix in MATSEQSBAIJ format */
2737:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2739:   b = (Mat_SeqSBAIJ*)(fact)->data;

2741:   PetscFree2(b->imax,b->ilen);

2743:   b->singlemalloc = PETSC_FALSE;
2744:   b->free_a       = PETSC_TRUE;
2745:   b->free_ij      = free_ij;

2747:   PetscMalloc1(ui[am]+1,&b->a);

2749:   b->j             = uj;
2750:   b->i             = ui;
2751:   b->diag          = NULL;
2752:   b->ilen          = NULL;
2753:   b->imax          = NULL;
2754:   b->row           = perm;
2755:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2757:   PetscObjectReference((PetscObject)perm);
2758:   b->icol = perm;
2759:   PetscObjectReference((PetscObject)perm);
2760:   PetscMalloc1(am+1,&b->solve_work);

2762:   b->maxnz = b->nz = ui[am];

2764:   fact->info.factor_mallocs    = reallocs;
2765:   fact->info.fill_ratio_given  = fill;
2766:   fact->info.fill_ratio_needed = ratio_needed;
2767: #if defined(PETSC_USE_INFO)
2768:   if (ai[am] != 0) {
2769:     PetscReal af = fact->info.fill_ratio_needed;
2770:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2771:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2772:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2773:   } else {
2774:     PetscInfo(A,"Empty matrix\n");
2775:   }
2776: #endif
2777:   if (perm_identity) {
2778:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2779:   } else {
2780:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2781:   }
2782:   return(0);
2783: }