Actual source code: bvlapack.c

slepc-3.19.2 2023-09-05
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV private kernels that use LAPACK
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>

/*
    Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
*/
SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt i,n=*len;
  PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

  PetscFunctionBegin;
  if (PetscUnlikely(*datatype!=MPIU_REAL)) {
    (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
    MPI_Abort(PETSC_COMM_WORLD,1);
  }
  for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
  PetscFunctionReturnVoid();
}
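
/*
    A minimal sketch of how a reduction built from SlepcPythag could combine
    per-process Frobenius norms, since ||A||_F^2 is the sum of the squared
    local norms.  The names hypot_op, comm, m, n and A are assumptions for the
    example (PetscBLASInt sizes, column-major local array); the functions below
    use the predefined operation MPIU_LAPY2 for the same purpose.

      MPI_Op    hypot_op;
      PetscReal lnrm,gnrm;

      PetscCallMPI(MPI_Op_create(SlepcPythag,1,&hypot_op));
      lnrm = LAPACKlange_("F",&m,&n,A,&m,NULL);                    // local norm
      PetscCall(MPIU_Allreduce(&lnrm,&gnrm,1,MPIU_REAL,hypot_op,comm));
      PetscCallMPI(MPI_Op_free(&hypot_op));
*/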

/*
    Compute ||A|| for an mxn matrix
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt   m,n,i,j;
  PetscMPIInt    len;
  PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  if (type==NORM_FROBENIUS || type==NORM_2) {
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(2.0*m*n));
  } else if (type==NORM_1) {
    if (mpi) {
      PetscCall(BVAllocateWork_Private(bv,2*n_));
      rwork = (PetscReal*)bv->work;
      rwork2 = rwork+n_;
      PetscCall(PetscArrayzero(rwork,n_));
      PetscCall(PetscArrayzero(rwork2,n_));
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*m_]);
        }
      }
      PetscCall(PetscMPIIntCast(n_,&len));
      PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
    }
    PetscCall(PetscLogFlops(1.0*m*n));
  } else if (type==NORM_INFINITY) {
    PetscCall(BVAllocateWork_Private(bv,m_));
    rwork = (PetscReal*)bv->work;
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(1.0*m*n));
  }
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
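
/*
    How the parallel reductions above combine local results (assuming the usual
    row-wise distribution of the local arrays):
      - Frobenius/2-norm: local norms x,y are merged as sqrt(x^2+y^2)
        (MPIU_LAPY2), since ||A||_F^2 is the sum of the squared local norms.
      - 1-norm: local column sums are added with MPIU_SUM and then the maximum
        of the global column sums is taken, since ||A||_1 = max_j sum_i |a_ij|.
      - Infinity-norm: every row is local to one process, so the local maxima
        of the row sums are merged with MPIU_MAX.
*/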

/*
    Normalize the columns of an mxn matrix A
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt   m,j,k,info,zero=0;
  PetscMPIInt    len;
  PetscReal      *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(BVAllocateWork_Private(bv,2*n_));
  rwork = (PetscReal*)bv->work;
  rwork2 = rwork+n_;
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscCall(PetscMPIIntCast(n_,&len));
    PetscCall(PetscArrayzero(rwork2,n_));
    PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;
  }
  PetscCall(PetscLogFlops(3.0*m*n_));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
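
/*
    In real arithmetic, a complex conjugate eigenvector pair is stored as two
    consecutive columns (real and imaginary parts), flagged by a nonzero
    eigi[j].  The loops above therefore treat such a pair jointly: the
    Frobenius norm of the m-by-2 block equals the 2-norm of the complex
    eigenvector, and both columns are scaled by that same norm.
*/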

/*
   Compute the upper Cholesky factor in R and its inverse in S.
   If S == R then the inverse overwrites the Cholesky factor.
 */
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_ = 0,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* save a copy of matrix in S */
  for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));

  /* compute upper Cholesky factor in R */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
    }
    PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
  }

  /* compute S = inv(R) */
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  /* Zero out entries below the diagonal */
  for (i=l;i<k-1;i++) {
    PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
    if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
  }
  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
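
/*
    The pair (R, S=inv(R)) computed above is the core of a Cholesky-QR step:
    if R'*R = V'*V then W = V*inv(R) satisfies
    W'*W = inv(R)'*(V'*V)*inv(R) = inv(R)'*R'*R*inv(R) = I,
    so W has orthonormal columns.  A hedged caller sketch using public BV
    operations (G is a hypothetical sequential dense Mat holding the Gram
    matrix; this is not necessarily the exact sequence used elsewhere in SLEPc):

      PetscCall(BVDot(bv,bv,G));                           // G = V'*V
      PetscCall(BVMatCholInv_LAPACK_Private(bv,G,G));      // G <- inv(chol(G))
      PetscCall(BVMultInPlace(bv,G,bv->l,bv->k));          // V <- V*inv(R)
*/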

/*
   Compute the inverse of an upper triangular matrix R, store it in S.
   If S == R then the inverse overwrites R.
 */
PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* compute S = inv(R) */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
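
/*
    This is the same inversion step as the tail of BVMatCholInv_LAPACK_Private,
    for the case in which R is already upper triangular and no Cholesky
    factorization is needed; only the active block (rows/columns l..k-1) is
    inverted, and when S != R the input factor R is left untouched.
*/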

/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.
 */
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,j,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS,*D,*work,a;
  PetscReal      *eig,dummy;
  PetscBLASInt   info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal      *rwork,rdummy;
#endif

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  PetscCall(MatDenseGetLDA(R,&ld));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    pS = pR;
    lds = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatDenseGetLDA(S,&lds));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));

  /* workspace query and memory allocation */
  lwork = -1;
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
  PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
  PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
#endif

  /* copy and scale matrix */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

  /* compute eigendecomposition */
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {   /* R = U' */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscCall(PetscFree4(eig,D,work,rwork));
#else
  PetscCall(PetscFree3(eig,D,work));
#endif
  PetscCall(PetscLogFlops(9.0*n*n*n));
  PetscCall(PetscFPTrapPop());

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
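
/*
    Why V*S has orthonormal columns (the SVQB property): with R = V'*V,
    D = diag(R)^{-1/2} and D*R*D = U*Lambda*U', the output S = D*U*Lambda^{-1/2}
    gives

      (V*S)'*(V*S) = S'*R*S
                   = Lambda^{-1/2}*U'*(D*R*D)*U*Lambda^{-1/2}
                   = Lambda^{-1/2}*Lambda*Lambda^{-1/2} = I.

    The diagonal scaling by D equilibrates the Gram matrix before the symmetric
    eigensolve, which improves its conditioning.
*/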

/*
    QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
  PetscBLASInt   m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
  PetscMPIInt    rank,size,count,stride;
  MPI_Datatype   tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  k  = PetscMin(m,n);
  nb = 16;
  lda = 2*n;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
  nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
  powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
  worklen = n+n*nb;
  if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  if (nlevels) {
    A  = bv->work+n+n*nb;
    QQ = bv->work+n+n*nb+n*lda;
    C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
  }

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  /* Extract R */
  if (R || nlevels) {
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) {
        if (nlevels) A[i+j*lda] = Q[i+j*m];
        else R[i+j*ldr] = Q[i+j*m];
      }
      for (i=PetscMin(j,m-1)+1;i<n;i++) {
        if (nlevels) A[i+j*lda] = 0.0;
        else R[i+j*ldr] = 0.0;
      }
    }
  }

  /* Compute orthogonal matrix in Q */
  PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("orgqr",info);

  if (nlevels) {

    PetscCall(PetscMPIIntCast(n,&count));
    PetscCall(PetscMPIIntCast(lda,&stride));
    PetscCall(PetscBLASIntCast(lda,&l));
    PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));

    for (level=nlevels;level>=1;level--) {

      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));

      /* Stack triangular matrices */
      if (rank<s && s<size) {  /* send top part, receive bottom part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      } else if (s<size) {  /* copy top to bottom, receive top part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      }
      if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
        if (rank<size-powtwo) {  /* send bottom part */
          PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
        } else if (rank>=powtwo) {  /* receive bottom part */
          PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        }
      }
      /* Compute QR and build orthogonal matrix */
      if (level<nlevels || (level==nlevels && s<size)) {
        PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("geqrf",info);
        PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
        PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("orgqr",info);
        for (j=0;j<n;j++) {
          for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
        }
      } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
        PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
        for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
      }
    }

    /* Extract R */
    if (R) {
      for (j=0;j<n;j++) {
        for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
        for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
      }
    }

    /* Accumulate orthogonal matrices */
    for (level=1;level<=nlevels;level++) {
      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
      Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
      if (level<nlevels) {
        PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
        PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
      } else {
        for (i=0;i<m/l;i++) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*m,C+j*l,l));
        }
        sz = m%l;
        if (sz) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz));
        }
      }
    }

    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
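
/*
    Sketch of the reduction tree above for two processes (one level): each
    process first factors its local block, A_p = Q_p*R_p; the two n-by-n
    triangular factors are then stacked and factored again,
    [R_0;R_1] = Qtilde*R, so that A = [A_0;A_1] = diag(Q_0,Q_1)*Qtilde*R.
    The final orthogonal factor is recovered by multiplying each local Q_p
    with the corresponding n-by-n block of Qtilde (the gemm calls in the
    accumulation loop).  With more processes the same pairing is repeated for
    ceil(log2(size)) levels, and the extra sends/receives handle communicator
    sizes that are not a power of two.
*/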

/*
    Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
    all matrices are upper triangular stored in packed format
*/
SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt   n,i,j,k,one=1;
  PetscMPIInt    tsize;
  PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
  PetscReal      c;

  PetscFunctionBegin;
  PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize));  /* we assume len=1 */
  tsize /= sizeof(PetscScalar);
  n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
  for (j=0;j<n;j++) {
    for (i=0;i<=j;i++) {
      LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
      R1[(2*n-j-1)*j/2+j] = v;
      k = n-j-1;
      if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
    }
  }
  PetscFunctionReturnVoid();
}
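
/*
    Packed storage used above: the rows of an n-by-n upper triangular matrix
    are concatenated, so entry (i,j) with j >= i (0-based) lives at offset
    (2*n-i-1)*i/2 + j and the whole matrix takes n*(n+1)/2 scalars.  The
    datatype size therefore determines n: solving tsize = n*(n+1)/2 gives
    n = (-1+sqrt(1+8*tsize))/2, which is exactly how n is recovered from the
    MPI datatype.  Each Givens rotation generated by lartg annihilates one
    entry of the incoming factor R2 against the diagonal of R1, and rot
    applies the same rotation to the remainder of the two rows.
*/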

/*
    Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt       worklen;
  PetscBLASInt   m,n,i,j,s,nb,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
  PetscMPIInt    size,count;
  MPI_Datatype   tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  nb = 16;
  s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  worklen = n+n*nb+2*s+m*n;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  R1   = bv->work+n+n*nb;
  R2   = bv->work+n+n*nb+s;
  A    = bv->work+n+n*nb+2*s;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  PetscCall(PetscArraycpy(A,Q,m*n));

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  if (size==1) {
    /* Extract R */
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
      for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
    }
  } else {
    /* Use MPI reduction operation to obtain global R */
    PetscCall(PetscMPIIntCast(s,&count));
    PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));
    for (i=0;i<n;i++) {
      for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
    }
    PetscCall(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
    for (i=0;i<n;i++) {
      for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
      for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
    }
    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
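
/*
    Unlike BVOrthogonalize_LAPACK_TSQR above, this variant leaves Q unmodified
    (the local QR is done on a scratch copy) and obtains the global R with a
    single Allreduce of the n*(n+1)/2 packed entries, using the MPIU_TSQR
    reduction, presumably built from the SlepcGivensPacked kernel defined
    earlier in this file.  A hedged usage sketch (V is an assumed local m-by-n
    column-major array; r is a caller-provided n-by-n buffer):

      PetscScalar *r;
      PetscCall(PetscMalloc1(n*n,&r));
      PetscCall(BVOrthogonalize_LAPACK_TSQR_OnlyR(bv,m,n,V,r,n));
      // r now holds R with [V_0;V_1;...] = Q*R across the communicator; V is unchanged
      PetscCall(PetscFree(r));
*/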