Blender  V2.59
spanel_bmod.c
Go to the documentation of this file.
00001 
00005 /*
00006  * -- SuperLU routine (version 3.0) --
00007  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
00008  * and Lawrence Berkeley National Lab.
00009  * October 15, 2003
00010  *
00011  */
00012 /*
00013   Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
00014  
00015   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
00016   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
00017  
00018   Permission is hereby granted to use or copy this program for any
00019   purpose, provided the above notices are retained on all copies.
00020   Permission to modify the code and to distribute modified code is
00021   granted, provided the above notices are retained, and a notice that
00022   the code was modified is included with the above copyright notice.
00023 */
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include "ssp_defs.h"
00028 
00029 /* 
00030  * Function prototypes 
00031  */
00032 void slsolve(int, int, float *, float *);
00033 void smatvec(int, int, int, float *, float *, float *);
00034 extern void scheck_tempv();
00035 
00036 void
00037 spanel_bmod (
00038             const int  m,          /* in - number of rows in the matrix */
00039             const int  w,          /* in */
00040             const int  jcol,       /* in */
00041             const int  nseg,       /* in */
00042             float     *dense,     /* out, of size n by w */
00043             float     *tempv,     /* working array */
00044             int        *segrep,    /* in */
00045             int        *repfnz,    /* in, of size n by w */
00046             GlobalLU_t *Glu,       /* modified */
00047             SuperLUStat_t *stat    /* output */
00048             )
00049 {
00050 /* 
00051  * Purpose
00052  * =======
00053  *
00054  *    Performs numeric block updates (sup-panel) in topological order.
00055  *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
00056  *    Special processing on the supernodal portion of L\U[*,j]
00057  *
00058  *    Before entering this routine, the original nonzeros in the panel 
00059  *    were already copied into the spa[m,w].
00060  *
00061  *    Updated/Output parameters-
00062  *      dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
00063  *      collectively in the m-by-w vector dense[*]. 
00064  *
00065  */
00066 
00067 #ifdef USE_VENDOR_BLAS
00068 #ifdef _CRAY
00069     _fcd ftcs1 = _cptofcd("L", strlen("L")),
00070          ftcs2 = _cptofcd("N", strlen("N")),
00071          ftcs3 = _cptofcd("U", strlen("U"));
00072 #endif
00073     int          incx = 1, incy = 1;
00074     float       alpha, beta;
00075 #endif
00076 
00077     register int k, ksub;
00078     int          fsupc, nsupc, nsupr, nrow;
00079     int          krep, krep_ind;
00080     float       ukj, ukj1, ukj2;
00081     int          luptr, luptr1, luptr2;
00082     int          segsze;
00083     int          block_nrow;  /* no of rows in a block row */
00084     register int lptr;        /* Points to the row subscripts of a supernode */
00085     int          kfnz, irow, no_zeros; 
00086     register int isub, isub1, i;
00087     register int jj;          /* Index through each column in the panel */
00088     int          *xsup, *supno;
00089     int          *lsub, *xlsub;
00090     float       *lusup;
00091     int          *xlusup;
00092     int          *repfnz_col; /* repfnz[] for a column in the panel */
00093     float       *dense_col;  /* dense[] for a column in the panel */
00094     float       *tempv1;             /* Used in 1-D update */
00095     float       *TriTmp, *MatvecTmp; /* used in 2-D update */
00096     float      zero = 0.0;
00097     register int ldaTmp;
00098     register int r_ind, r_hi;
00099     static   int first = 1, maxsuper, rowblk, colblk;
00100     flops_t  *ops = stat->ops;
00101     
00102     xsup    = Glu->xsup;
00103     supno   = Glu->supno;
00104     lsub    = Glu->lsub;
00105     xlsub   = Glu->xlsub;
00106     lusup   = Glu->lusup;
00107     xlusup  = Glu->xlusup;
00108     
00109     if ( first ) {
00110         maxsuper = sp_ienv(3);
00111         rowblk   = sp_ienv(4);
00112         colblk   = sp_ienv(5);
00113         first = 0;
00114     }
00115     ldaTmp = maxsuper + rowblk;
00116 
00117     /* 
00118      * For each nonz supernode segment of U[*,j] in topological order 
00119      */
00120     k = nseg - 1;
00121     for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
00122 
00123         /* krep = representative of current k-th supernode
00124          * fsupc = first supernodal column
00125          * nsupc = no of columns in a supernode
00126          * nsupr = no of rows in a supernode
00127          */
00128         krep = segrep[k--];
00129         fsupc = xsup[supno[krep]];
00130         nsupc = krep - fsupc + 1;
00131         nsupr = xlsub[fsupc+1] - xlsub[fsupc];
00132         nrow = nsupr - nsupc;
00133         lptr = xlsub[fsupc];
00134         krep_ind = lptr + nsupc - 1;
00135 
00136         repfnz_col = repfnz;
00137         dense_col = dense;
00138         
00139         if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
00140 
00141             TriTmp = tempv;
00142         
00143             /* Sequence through each column in panel -- triangular solves */
00144             for (jj = jcol; jj < jcol + w; jj++,
00145                  repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
00146 
00147                 kfnz = repfnz_col[krep];
00148                 if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */
00149             
00150                 segsze = krep - kfnz + 1;
00151                 luptr = xlusup[fsupc];
00152 
00153                 ops[TRSV] += segsze * (segsze - 1);
00154                 ops[GEMV] += 2 * nrow * segsze;
00155         
00156                 /* Case 1: Update U-segment of size 1 -- col-col update */
00157                 if ( segsze == 1 ) {
00158                     ukj = dense_col[lsub[krep_ind]];
00159                     luptr += nsupr*(nsupc-1) + nsupc;
00160 
00161                     for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
00162                         irow = lsub[i];
00163                         dense_col[irow] -= ukj * lusup[luptr];
00164                         ++luptr;
00165                     }
00166 
00167                 } else if ( segsze <= 3 ) {
00168                     ukj = dense_col[lsub[krep_ind]];
00169                     ukj1 = dense_col[lsub[krep_ind - 1]];
00170                     luptr += nsupr*(nsupc-1) + nsupc-1;
00171                     luptr1 = luptr - nsupr;
00172 
00173                     if ( segsze == 2 ) {
00174                         ukj -= ukj1 * lusup[luptr1];
00175                         dense_col[lsub[krep_ind]] = ukj;
00176                         for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00177                             irow = lsub[i];
00178                             luptr++; luptr1++;
00179                             dense_col[irow] -= (ukj*lusup[luptr]
00180                                                 + ukj1*lusup[luptr1]);
00181                         }
00182                     } else {
00183                         ukj2 = dense_col[lsub[krep_ind - 2]];
00184                         luptr2 = luptr1 - nsupr;
00185                         ukj1 -= ukj2 * lusup[luptr2-1];
00186                         ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
00187                         dense_col[lsub[krep_ind]] = ukj;
00188                         dense_col[lsub[krep_ind-1]] = ukj1;
00189                         for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00190                             irow = lsub[i];
00191                             luptr++; luptr1++; luptr2++;
00192                             dense_col[irow] -= ( ukj*lusup[luptr]
00193                              + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
00194                         }
00195                     }
00196 
00197                 } else  {       /* segsze >= 4 */
00198                     
00199                     /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
00200                        holds the result of triangular solves.    */
00201                     no_zeros = kfnz - fsupc;
00202                     isub = lptr + no_zeros;
00203                     for (i = 0; i < segsze; ++i) {
00204                         irow = lsub[isub];
00205                         TriTmp[i] = dense_col[irow]; /* Gather */
00206                         ++isub;
00207                     }
00208                     
00209                     /* start effective triangle */
00210                     luptr += nsupr * no_zeros + no_zeros;
00211 
00212 #ifdef USE_VENDOR_BLAS
00213 #ifdef _CRAY
00214                     STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
00215                            &nsupr, TriTmp, &incx );
00216 #else
00217                     strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
00218                            &nsupr, TriTmp, &incx );
00219 #endif
00220 #else           
00221                     slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
00222 #endif
00223                     
00224 
00225                 } /* else ... */
00226             
00227             }  /* for jj ... end tri-solves */
00228 
00229             /* Block row updates; push all the way into dense[*] block */
00230             for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
00231                 
00232                 r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
00233                 block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
00234                 luptr = xlusup[fsupc] + nsupc + r_ind;
00235                 isub1 = lptr + nsupc + r_ind;
00236                 
00237                 repfnz_col = repfnz;
00238                 TriTmp = tempv;
00239                 dense_col = dense;
00240                 
00241                 /* Sequence through each column in panel -- matrix-vector */
00242                 for (jj = jcol; jj < jcol + w; jj++,
00243                      repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
00244                     
00245                     kfnz = repfnz_col[krep];
00246                     if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
00247                     
00248                     segsze = krep - kfnz + 1;
00249                     if ( segsze <= 3 ) continue;   /* skip unrolled cases */
00250                     
00251                     /* Perform a block update, and scatter the result of
00252                        matrix-vector to dense[].                 */
00253                     no_zeros = kfnz - fsupc;
00254                     luptr1 = luptr + nsupr * no_zeros;
00255                     MatvecTmp = &TriTmp[maxsuper];
00256                     
00257 #ifdef USE_VENDOR_BLAS
00258                     alpha = one; 
00259                     beta = zero;
00260 #ifdef _CRAY
00261                     SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
00262                            &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
00263 #else
00264                     sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
00265                            &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
00266 #endif
00267 #else
00268                     smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
00269                            TriTmp, MatvecTmp);
00270 #endif
00271                     
00272                     /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
00273                      * such that MatvecTmp[*] can be re-used for the
00274                      * the next blok row update. dense[] will be copied into 
00275                      * global store after the whole panel has been finished.
00276                      */
00277                     isub = isub1;
00278                     for (i = 0; i < block_nrow; i++) {
00279                         irow = lsub[isub];
00280                         dense_col[irow] -= MatvecTmp[i];
00281                         MatvecTmp[i] = zero;
00282                         ++isub;
00283                     }
00284                     
00285                 } /* for jj ... */
00286                 
00287             } /* for each block row ... */
00288             
00289             /* Scatter the triangular solves into SPA dense[*] */
00290             repfnz_col = repfnz;
00291             TriTmp = tempv;
00292             dense_col = dense;
00293             
00294             for (jj = jcol; jj < jcol + w; jj++,
00295                  repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
00296                 kfnz = repfnz_col[krep];
00297                 if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
00298                 
00299                 segsze = krep - kfnz + 1;
00300                 if ( segsze <= 3 ) continue; /* skip unrolled cases */
00301                 
00302                 no_zeros = kfnz - fsupc;                
00303                 isub = lptr + no_zeros;
00304                 for (i = 0; i < segsze; i++) {
00305                     irow = lsub[isub];
00306                     dense_col[irow] = TriTmp[i];
00307                     TriTmp[i] = zero;
00308                     ++isub;
00309                 }
00310                 
00311             } /* for jj ... */
00312             
00313         } else { /* 1-D block modification */
00314             
00315             
00316             /* Sequence through each column in the panel */
00317             for (jj = jcol; jj < jcol + w; jj++,
00318                  repfnz_col += m, dense_col += m) {
00319                 
00320                 kfnz = repfnz_col[krep];
00321                 if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */
00322                 
00323                 segsze = krep - kfnz + 1;
00324                 luptr = xlusup[fsupc];
00325 
00326                 ops[TRSV] += segsze * (segsze - 1);
00327                 ops[GEMV] += 2 * nrow * segsze;
00328                 
00329                 /* Case 1: Update U-segment of size 1 -- col-col update */
00330                 if ( segsze == 1 ) {
00331                     ukj = dense_col[lsub[krep_ind]];
00332                     luptr += nsupr*(nsupc-1) + nsupc;
00333 
00334                     for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
00335                         irow = lsub[i];
00336                         dense_col[irow] -= ukj * lusup[luptr];
00337                         ++luptr;
00338                     }
00339 
00340                 } else if ( segsze <= 3 ) {
00341                     ukj = dense_col[lsub[krep_ind]];
00342                     luptr += nsupr*(nsupc-1) + nsupc-1;
00343                     ukj1 = dense_col[lsub[krep_ind - 1]];
00344                     luptr1 = luptr - nsupr;
00345 
00346                     if ( segsze == 2 ) {
00347                         ukj -= ukj1 * lusup[luptr1];
00348                         dense_col[lsub[krep_ind]] = ukj;
00349                         for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00350                             irow = lsub[i];
00351                             ++luptr;  ++luptr1;
00352                             dense_col[irow] -= (ukj*lusup[luptr]
00353                                                 + ukj1*lusup[luptr1]);
00354                         }
00355                     } else {
00356                         ukj2 = dense_col[lsub[krep_ind - 2]];
00357                         luptr2 = luptr1 - nsupr;
00358                         ukj1 -= ukj2 * lusup[luptr2-1];
00359                         ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
00360                         dense_col[lsub[krep_ind]] = ukj;
00361                         dense_col[lsub[krep_ind-1]] = ukj1;
00362                         for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00363                             irow = lsub[i];
00364                             ++luptr; ++luptr1; ++luptr2;
00365                             dense_col[irow] -= ( ukj*lusup[luptr]
00366                              + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
00367                         }
00368                     }
00369 
00370                 } else  { /* segsze >= 4 */
00371                     /* 
00372                      * Perform a triangular solve and block update,
00373                      * then scatter the result of sup-col update to dense[].
00374                      */
00375                     no_zeros = kfnz - fsupc;
00376                     
00377                     /* Copy U[*,j] segment from dense[*] to tempv[*]: 
00378                      *    The result of triangular solve is in tempv[*];
00379                      *    The result of matrix vector update is in dense_col[*]
00380                      */
00381                     isub = lptr + no_zeros;
00382                     for (i = 0; i < segsze; ++i) {
00383                         irow = lsub[isub];
00384                         tempv[i] = dense_col[irow]; /* Gather */
00385                         ++isub;
00386                     }
00387                     
00388                     /* start effective triangle */
00389                     luptr += nsupr * no_zeros + no_zeros;
00390                     
00391 #ifdef USE_VENDOR_BLAS
00392 #ifdef _CRAY
00393                     STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
00394                            &nsupr, tempv, &incx );
00395 #else
00396                     strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
00397                            &nsupr, tempv, &incx );
00398 #endif
00399                     
00400                     luptr += segsze;    /* Dense matrix-vector */
00401                     tempv1 = &tempv[segsze];
00402                     alpha = one;
00403                     beta = zero;
00404 #ifdef _CRAY
00405                     SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
00406                            &nsupr, tempv, &incx, &beta, tempv1, &incy );
00407 #else
00408                     sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
00409                            &nsupr, tempv, &incx, &beta, tempv1, &incy );
00410 #endif
00411 #else
00412                     slsolve ( nsupr, segsze, &lusup[luptr], tempv );
00413                     
00414                     luptr += segsze;        /* Dense matrix-vector */
00415                     tempv1 = &tempv[segsze];
00416                     smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
00417 #endif
00418                     
00419                     /* Scatter tempv[*] into SPA dense[*] temporarily, such
00420                      * that tempv[*] can be used for the triangular solve of
00421                      * the next column of the panel. They will be copied into 
00422                      * ucol[*] after the whole panel has been finished.
00423                      */
00424                     isub = lptr + no_zeros;
00425                     for (i = 0; i < segsze; i++) {
00426                         irow = lsub[isub];
00427                         dense_col[irow] = tempv[i];
00428                         tempv[i] = zero;
00429                         isub++;
00430                     }
00431                     
00432                     /* Scatter the update from tempv1[*] into SPA dense[*] */
00433                     /* Start dense rectangular L */
00434                     for (i = 0; i < nrow; i++) {
00435                         irow = lsub[isub];
00436                         dense_col[irow] -= tempv1[i];
00437                         tempv1[i] = zero;
00438                         ++isub; 
00439                     }
00440                     
00441                 } /* else segsze>=4 ... */
00442                 
00443             } /* for each column in the panel... */
00444             
00445         } /* else 1-D update ... */
00446 
00447     } /* for each updating supernode ... */
00448 
00449 }
00450 
00451 
00452