Blender  V2.59
scolumn_bmod.c
Go to the documentation of this file.
00001 
00005 /*
00006  * -- SuperLU routine (version 3.0) --
00007  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
00008  * and Lawrence Berkeley National Lab.
00009  * October 15, 2003
00010  *
00011  */
00012 /*
00013   Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
00014  
00015   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
00016   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
00017  
00018   Permission is hereby granted to use or copy this program for any
00019   purpose, provided the above notices are retained on all copies.
00020   Permission to modify the code and to distribute modified code is
00021   granted, provided the above notices are retained, and a notice that
00022   the code was modified is included with the above copyright notice.
00023 */
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include "ssp_defs.h"
00028 
00029 /* 
00030  * Function prototypes 
00031  */
00032 void susolve(int, int, float*, float*);
00033 void slsolve(int, int, float*, float*);
00034 void smatvec(int, int, int, float*, float*, float*);
00035 
00036 
00037 
00038 /* Return value:   0 - successful return
00039  *               > 0 - number of bytes allocated when run out of space
00040  */
00041 int
00042 scolumn_bmod (
00043              const int  jcol,     /* in */
00044              const int  nseg,     /* in */
00045              float     *dense,    /* in */
00046              float     *tempv,    /* working array */
00047              int        *segrep,  /* in */
00048              int        *repfnz,  /* in */
00049              int        fpanelc,  /* in -- first column in the current panel */
00050              GlobalLU_t *Glu,     /* modified */
00051              SuperLUStat_t *stat  /* output */
00052              )
00053 {
00054 /*
00055  * Purpose:
00056  * ========
00057  *    Performs numeric block updates (sup-col) in topological order.
00058  *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
00059  *    Special processing on the supernodal portion of L\U[*,j]
00060  *
00061  */
00062 #ifdef _CRAY
00063     _fcd ftcs1 = _cptofcd("L", strlen("L")),
00064          ftcs2 = _cptofcd("N", strlen("N")),
00065          ftcs3 = _cptofcd("U", strlen("U"));
00066 #endif
00067 
00068 #ifdef USE_VENDOR_BLAS
00069     int         incx = 1, incy = 1;
00070     float      alpha, beta;
00071 #endif
00072     
00073     /* krep = representative of current k-th supernode
00074      * fsupc = first supernodal column
00075      * nsupc = no of columns in supernode
00076      * nsupr = no of rows in supernode (used as leading dimension)
00077      * luptr = location of supernodal LU-block in storage
00078      * kfnz = first nonz in the k-th supernodal segment
00079      * no_zeros = no of leading zeros in a supernodal U-segment
00080      */
00081     float       ukj, ukj1, ukj2;
00082     int          luptr, luptr1, luptr2;
00083     int          fsupc, nsupc, nsupr, segsze;
00084     int          nrow;    /* No of rows in the matrix of matrix-vector */
00085     int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
00086     register int lptr, kfnz, isub, irow, i;
00087     register int no_zeros, new_next; 
00088     int          ufirst, nextlu;
00089     int          fst_col; /* First column within small LU update */
00090     int          d_fsupc; /* Distance between the first column of the current
00091                              panel and the first column of the current snode. */
00092     int          *xsup, *supno;
00093     int          *lsub, *xlsub;
00094     float       *lusup;
00095     int          *xlusup;
00096     int          nzlumax;
00097     float       *tempv1;
00098     float      zero = 0.0;
00099 #ifdef USE_VENDOR_BLAS
00100     float      one = 1.0;
00101     float      none = -1.0;
00102 #endif
00103     int          mem_error;
00104     flops_t      *ops = stat->ops;
00105 
00106     xsup    = Glu->xsup;
00107     supno   = Glu->supno;
00108     lsub    = Glu->lsub;
00109     xlsub   = Glu->xlsub;
00110     lusup   = Glu->lusup;
00111     xlusup  = Glu->xlusup;
00112     nzlumax = Glu->nzlumax;
00113     jcolp1 = jcol + 1;
00114     jsupno = supno[jcol];
00115     
00116     /* 
00117      * For each nonz supernode segment of U[*,j] in topological order 
00118      */
00119     k = nseg - 1;
00120     for (ksub = 0; ksub < nseg; ksub++) {
00121 
00122         krep = segrep[k];
00123         k--;
00124         ksupno = supno[krep];
00125         if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
00126 
00127             fsupc = xsup[ksupno];
00128             fst_col = SUPERLU_MAX ( fsupc, fpanelc );
00129 
00130             /* Distance from the current supernode to the current panel; 
00131                d_fsupc=0 if fsupc > fpanelc. */
00132             d_fsupc = fst_col - fsupc; 
00133 
00134             luptr = xlusup[fst_col] + d_fsupc;
00135             lptr = xlsub[fsupc] + d_fsupc;
00136 
00137             kfnz = repfnz[krep];
00138             kfnz = SUPERLU_MAX ( kfnz, fpanelc );
00139 
00140             segsze = krep - kfnz + 1;
00141             nsupc = krep - fst_col + 1;
00142             nsupr = xlsub[fsupc+1] - xlsub[fsupc];      /* Leading dimension */
00143             nrow = nsupr - d_fsupc - nsupc;
00144             krep_ind = lptr + nsupc - 1;
00145 
00146             ops[TRSV] += segsze * (segsze - 1);
00147             ops[GEMV] += 2 * nrow * segsze;
00148 
00149 
00150             /* 
00151              * Case 1: Update U-segment of size 1 -- col-col update 
00152              */
00153             if ( segsze == 1 ) {
00154                 ukj = dense[lsub[krep_ind]];
00155                 luptr += nsupr*(nsupc-1) + nsupc;
00156 
00157                 for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00158                     irow = lsub[i];
00159                     dense[irow] -=  ukj*lusup[luptr];
00160                     luptr++;
00161                 }
00162 
00163             } else if ( segsze <= 3 ) {
00164                 ukj = dense[lsub[krep_ind]];
00165                 luptr += nsupr*(nsupc-1) + nsupc-1;
00166                 ukj1 = dense[lsub[krep_ind - 1]];
00167                 luptr1 = luptr - nsupr;
00168 
00169                 if ( segsze == 2 ) { /* Case 2: 2cols-col update */
00170                     ukj -= ukj1 * lusup[luptr1];
00171                     dense[lsub[krep_ind]] = ukj;
00172                     for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00173                         irow = lsub[i];
00174                         luptr++;
00175                         luptr1++;
00176                         dense[irow] -= ( ukj*lusup[luptr]
00177                                         + ukj1*lusup[luptr1] );
00178                     }
00179                 } else { /* Case 3: 3cols-col update */
00180                     ukj2 = dense[lsub[krep_ind - 2]];
00181                     luptr2 = luptr1 - nsupr;
00182                     ukj1 -= ukj2 * lusup[luptr2-1];
00183                     ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
00184                     dense[lsub[krep_ind]] = ukj;
00185                     dense[lsub[krep_ind-1]] = ukj1;
00186                     for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
00187                         irow = lsub[i];
00188                         luptr++;
00189                         luptr1++;
00190                         luptr2++;
00191                         dense[irow] -= ( ukj*lusup[luptr]
00192                              + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
00193                     }
00194                 }
00195 
00196 
00197 
00198             } else {
00199                 /*
00200                  * Case: sup-col update
00201                  * Perform a triangular solve and block update,
00202                  * then scatter the result of sup-col update to dense
00203                  */
00204 
00205                 no_zeros = kfnz - fst_col;
00206 
00207                 /* Copy U[*,j] segment from dense[*] to tempv[*] */
00208                 isub = lptr + no_zeros;
00209                 for (i = 0; i < segsze; i++) {
00210                     irow = lsub[isub];
00211                     tempv[i] = dense[irow];
00212                     ++isub; 
00213                 }
00214 
00215                 /* Dense triangular solve -- start effective triangle */
00216                 luptr += nsupr * no_zeros + no_zeros; 
00217                 
00218 #ifdef USE_VENDOR_BLAS
00219 #ifdef _CRAY
00220                 STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
00221                        &nsupr, tempv, &incx );
00222 #else           
00223                 strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
00224                        &nsupr, tempv, &incx );
00225 #endif          
00226                 luptr += segsze;  /* Dense matrix-vector */
00227                 tempv1 = &tempv[segsze];
00228                 alpha = one;
00229                 beta = zero;
00230 #ifdef _CRAY
00231                 SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
00232                        &nsupr, tempv, &incx, &beta, tempv1, &incy );
00233 #else
00234                 sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
00235                        &nsupr, tempv, &incx, &beta, tempv1, &incy );
00236 #endif
00237 #else
00238                 slsolve ( nsupr, segsze, &lusup[luptr], tempv );
00239 
00240                 luptr += segsze;  /* Dense matrix-vector */
00241                 tempv1 = &tempv[segsze];
00242                 smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
00243 #endif
00244                 
00245                 
00246                 /* Scatter tempv[] into SPA dense[] as a temporary storage */
00247                 isub = lptr + no_zeros;
00248                 for (i = 0; i < segsze; i++) {
00249                     irow = lsub[isub];
00250                     dense[irow] = tempv[i];
00251                     tempv[i] = zero;
00252                     ++isub;
00253                 }
00254 
00255                 /* Scatter tempv1[] into SPA dense[] */
00256                 for (i = 0; i < nrow; i++) {
00257                     irow = lsub[isub];
00258                     dense[irow] -= tempv1[i];
00259                     tempv1[i] = zero;
00260                     ++isub;
00261                 }
00262             }
00263             
00264         } /* if jsupno ... */
00265 
00266     } /* for each segment... */
00267 
00268     /*
00269      *  Process the supernodal portion of L\U[*,j]
00270      */
00271     nextlu = xlusup[jcol];
00272     fsupc = xsup[jsupno];
00273 
00274     /* Copy the SPA dense into L\U[*,j] */
00275     new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
00276     while ( new_next > nzlumax ) {
00277         if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)))
00278             return (mem_error);
00279         lusup = Glu->lusup;
00280         lsub = Glu->lsub;
00281     }
00282 
00283     for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
00284         irow = lsub[isub];
00285         lusup[nextlu] = dense[irow];
00286         dense[irow] = zero;
00287         ++nextlu;
00288     }
00289 
00290     xlusup[jcolp1] = nextlu;    /* Close L\U[*,jcol] */
00291 
00292     /* For more updates within the panel (also within the current supernode), 
00293      * should start from the first column of the panel, or the first column 
00294      * of the supernode, whichever is bigger. There are 2 cases:
00295      *    1) fsupc < fpanelc, then fst_col := fpanelc
00296      *    2) fsupc >= fpanelc, then fst_col := fsupc
00297      */
00298     fst_col = SUPERLU_MAX ( fsupc, fpanelc );
00299 
00300     if ( fst_col < jcol ) {
00301 
00302         /* Distance between the current supernode and the current panel.
00303            d_fsupc=0 if fsupc >= fpanelc. */
00304         d_fsupc = fst_col - fsupc;
00305 
00306         lptr = xlsub[fsupc] + d_fsupc;
00307         luptr = xlusup[fst_col] + d_fsupc;
00308         nsupr = xlsub[fsupc+1] - xlsub[fsupc];  /* Leading dimension */
00309         nsupc = jcol - fst_col; /* Excluding jcol */
00310         nrow = nsupr - d_fsupc - nsupc;
00311 
00312         /* Points to the beginning of jcol in snode L\U(jsupno) */
00313         ufirst = xlusup[jcol] + d_fsupc;        
00314 
00315         ops[TRSV] += nsupc * (nsupc - 1);
00316         ops[GEMV] += 2 * nrow * nsupc;
00317         
00318 #ifdef USE_VENDOR_BLAS
00319 #ifdef _CRAY
00320         STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
00321                &nsupr, &lusup[ufirst], &incx );
00322 #else
00323         strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
00324                &nsupr, &lusup[ufirst], &incx );
00325 #endif
00326         
00327         alpha = none; beta = one; /* y := beta*y + alpha*A*x */
00328 
00329 #ifdef _CRAY
00330         SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
00331                &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
00332 #else
00333         sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
00334                &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
00335 #endif
00336 #else
00337         slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
00338 
00339         smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
00340                 &lusup[ufirst], tempv );
00341         
00342         /* Copy updates from tempv[*] into lusup[*] */
00343         isub = ufirst + nsupc;
00344         for (i = 0; i < nrow; i++) {
00345             lusup[isub] -= tempv[i];
00346             tempv[i] = 0.0;
00347             ++isub;
00348         }
00349 
00350 #endif
00351         
00352         
00353     } /* if fst_col < jcol ... */ 
00354 
00355     return 0;
00356 }