Source for org.jfree.report.util.CSVTokenizer

   1: /**
   2:  * ========================================
   3:  * JFreeReport : a free Java report library
   4:  * ========================================
   5:  *
   6:  * Project Info:  http://reporting.pentaho.org/
   7:  *
   8:  * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors.
   9:  *
  10:  * This library is free software; you can redistribute it and/or modify it under the terms
  11:  * of the GNU Lesser General Public License as published by the Free Software Foundation;
  12:  * either version 2.1 of the License, or (at your option) any later version.
  13:  *
  14:  * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
  15:  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16:  * See the GNU Lesser General Public License for more details.
  17:  *
  18:  * You should have received a copy of the GNU Lesser General Public License along with this
  19:  * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  20:  * Boston, MA 02111-1307, USA.
  21:  *
  22:  * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
  23:  * in the United States and other countries.]
  24:  *
  25:  * ------------
  26:  * $Id: CSVTokenizer.java 3525 2007-10-16 11:43:48Z tmorgner $
  27:  * ------------
  28:  * (C) Copyright 2000-2005, by Object Refinery Limited.
  29:  * (C) Copyright 2005-2007, by Pentaho Corporation.
  30:  */
  31: package org.jfree.report.util;
  32: 
  33: import java.util.Enumeration;
  34: import java.util.NoSuchElementException;
  35: 
  36: /**
  37:  * The csv tokenizer class allows an application to break a Comma Separated Value format
  38:  * into tokens. The tokenization method is much simpler than the one used by the
  39:  * <code>StringTokenizer</code> class. The <code>CSVTokenizer</code> methods do not
  40:  * distinguish among identifiers, numbers, and quoted strings, nor do they recognize and
  41:  * skip comments.
  42:  * <p/>
  43:  * The set of separator (the characters that separate tokens) may be specified either at
  44:  * creation time or on a per-token basis.
  45:  * <p/>
  46:  * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on
  47:  * whether it was created with the <code>returnSeparators</code> flag having the value
  48:  * <code>true</code> or <code>false</code>: <ul> <li>If the flag is <code>false</code>,
  49:  * delimiter characters serve to separate tokens. A token is a maximal sequence of
  50:  * consecutive characters that are not separator. <li>If the flag is <code>true</code>,
  51:  * delimiter characters are themselves considered to be tokens. A token is thus either one
  52:  * delimiter character, or a maximal sequence of consecutive characters that are not
  53:  * separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current
  54:  * position within the string to be tokenized. Some operations advance this current
  55:  * position past the characters processed.<p> A token is returned by taking a substring of
  56:  * the string that was used to create the <tt>CSVTokenizer</tt> object.
  57:  * <p/>
  58:  * The following is one example of the use of the tokenizer. The code:
  59:  * <blockquote><pre>
  60:  *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
  61:  *     while (csvt.hasMoreTokens()) {
  62:  *         println(csvt.nextToken());
  63:  *     }
  64:  * </pre></blockquote>
  65:  * <p/>
  66:  * prints the following output:
  67:  * <blockquote><pre>
  68:  *     this
  69:  *     is
  70:  *     a
  71:  *     test
  72:  * </pre></blockquote>
  73:  *
  74:  * @author abupon
  75:  */
  76: public class CSVTokenizer implements Enumeration
  77: {
  78:   /**
  79:    * The complete record that should be separated into elements.
  80:    */
  81:   private String record;
  82:   /**
  83:    * The separator.
  84:    */
  85:   private String separator;
  86:   /**
  87:    * The quoting char.
  88:    */
  89:   private String quate;
  90: 
  91:   /**
  92:    * the current parsing position.
  93:    */
  94:   private int currentIndex;
  95: 
  96:   private boolean beforeStart;
  97: 
  98:   /**
  99:    * A possible separator constant.
 100:    */
 101:   public static final String SEPARATOR_COMMA = ",";
 102:   /**
 103:    * A possible separator constant.
 104:    */
 105:   public static final String SEPARATOR_TAB = "\t";
 106:   /**
 107:    * A possible separator constant.
 108:    */
 109:   public static final String SEPARATOR_SPACE = " ";
 110: 
 111:   /**
 112:    * A possible quote character constant.
 113:    */
 114:   public static final String DOUBLE_QUATE = "\"";
 115:   /**
 116:    * A possible quote character constant.
 117:    */
 118:   public static final String SINGLE_QUATE = "'";
 119: 
 120:   /**
 121:    * Constructs a csv tokenizer for the specified string. <code>theSeparator</code>
 122:    * argument is the separator for separating tokens.
 123:    * <p/>
 124:    * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator
 125:    * string is also returned as tokens. separator is returned as a string. If the flag is
 126:    * <code>false</code>, the separator string is skipped and only serve as separator
 127:    * between tokens.
 128:    *
 129:    * @param aString      a string to be parsed.
 130:    * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
 131:    *                     CSVTokenizer.SPACE, etc.).
 132:    * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE,
 133:    *                     etc.).
 134:    */
 135:   public CSVTokenizer (final String aString, final String theSeparator,
 136:                        final String theQuate)
 137:   {
 138:     if (aString == null)
 139:     {
 140:       throw new NullPointerException("The given string is null");
 141:     }
 142:     if (theSeparator == null)
 143:     {
 144:       throw new NullPointerException("The given separator is null");
 145:     }
 146:     if (theQuate == null)
 147:     {
 148:       throw new NullPointerException("The given quate is null");
 149:     }
 150:     this.record = aString.trim();
 151:     this.separator = theSeparator;
 152:     this.quate = theQuate;
 153:     this.currentIndex = 0;
 154:     this.beforeStart = true;
 155:   }
 156: 
 157:   /**
 158:    * Constructs a csv tokenizer for the specified string. The characters in the
 159:    * <code>theSeparator</code> argument are the separator for separating tokens. Separator
 160:    * string themselves will not be treated as tokens.
 161:    *
 162:    * @param aString      a string to be parsed.
 163:    * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
 164:    *                     CSVTokenizer.SPACE, etc.).
 165:    */
 166:   public CSVTokenizer (final String aString, final String theSeparator)
 167:   {
 168:     this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
 169:   }
 170: 
 171:   /**
 172:    * Constructs a string tokenizer for the specified string. The tokenizer uses the
 173:    * default separator set, which is <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator
 174:    * string themselves will not be treated as tokens.
 175:    *
 176:    * @param aString a string to be parsed.
 177:    */
 178:   public CSVTokenizer (final String aString)
 179:   {
 180:     this(aString, CSVTokenizer.SEPARATOR_COMMA);
 181:   }
 182: 
 183:   /**
 184:    * Tests if there are more tokens available from this tokenizer's string. If this method
 185:    * returns <tt>true</tt>, then a subsequent call to <tt>nextToken</tt> with no argument
 186:    * will successfully return a token.
 187:    *
 188:    * @return <code>true</code> if and only if there is at least one token in the string
 189:    *         after the current position; <code>false</code> otherwise.
 190:    */
 191:   public boolean hasMoreTokens ()
 192:   {
 193:     return (this.currentIndex < this.record.length());
 194:   }
 195: 
 196:   /**
 197:    * Returns the next token from this string tokenizer.
 198:    *
 199:    * @return the next token from this string tokenizer.
 200:    *
 201:    * @throws NoSuchElementException   if there are no more tokens in this tokenizer's
 202:    *                                  string.
 203:    * @throws IllegalArgumentException if given parameter string format was wrong
 204:    */
 205:   public String nextToken ()
 206:           throws NoSuchElementException, IllegalArgumentException
 207:   {
 208: 
 209:     if (!this.hasMoreTokens())
 210:     {
 211:       throw new NoSuchElementException();
 212:     }
 213: 
 214:     if (beforeStart == false)
 215:     {
 216:       currentIndex += this.separator.length();
 217:     }
 218:     else
 219:     {
 220:       beforeStart = false;
 221:     }
 222: 
 223:     StringBuffer token = new StringBuffer();
 224:     if (this.record.startsWith(this.quate, this.currentIndex))
 225:     {
 226:       String rec = this.record.substring(this.currentIndex + this.quate.length());
 227:       token.delete(0, token.length());
 228:       while (true)
 229:       {
 230:         final int end = rec.indexOf(this.quate);
 231:         if (end < 0)
 232:         {
 233:           throw new IllegalArgumentException("Illegal format");
 234:         }
 235: 
 236:         if (!rec.startsWith(this.quate, end + 1))
 237:         {
 238:           token.append(rec.substring(0, end));
 239:           break;
 240:         }
 241:         token.append(rec.substring(0, end + 1));
 242:         rec = rec.substring(end + this.quate.length() * 2);
 243:         this.currentIndex++;
 244:       }
 245: 
 246:       this.currentIndex += (token.length() + this.quate.length() * 2);
 247:     }
 248:     else
 249:     {
 250:       final int end = this.record.indexOf(this.separator, this.currentIndex);
 251:       if (end >= 0)
 252:       {
 253:         final int start = this.currentIndex;
 254:         token.delete(0, token.length());
 255:         token.append(this.record.substring(start, end));
 256:         this.currentIndex = end;
 257:       }
 258:       else
 259:       {
 260:         final int start = this.currentIndex;
 261:         token.delete(0, token.length());
 262:         token.append(this.record.substring(start));
 263:         this.currentIndex = this.record.length();
 264:       }
 265:     }
 266: 
 267:     return token.toString();
 268:   }
 269: 
 270:   /**
 271:    * Returns the next token in this string tokenizer's string. First, the set of
 272:    * characters considered to be separator by this <tt>CSVTokenizer</tt> object is changed
 273:    * to be the characters in the string <tt>separator</tt>. Then the next token in the
 274:    * string after the current position is returned. The current position is advanced
 275:    * beyond the recognized token.  The new delimiter set remains the default after this
 276:    * call.
 277:    *
 278:    * @param theSeparator the new separator.
 279:    * @return the next token, after switching to the new delimiter set.
 280:    *
 281:    * @throws java.util.NoSuchElementException
 282:    *          if there are no more tokens in this tokenizer's string.
 283:    */
 284:   public String nextToken (final String theSeparator)
 285:   {
 286:     separator = theSeparator;
 287:     return nextToken();
 288:   }
 289: 
 290:   /**
 291:    * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that
 292:    * this class can implement the <code>Enumeration</code> interface.
 293:    *
 294:    * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
 295:    *
 296:    * @see java.util.Enumeration
 297:    * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens()
 298:    */
 299:   public boolean hasMoreElements ()
 300:   {
 301:     return hasMoreTokens();
 302:   }
 303: 
 304:   /**
 305:    * Returns the same value as the <code>nextToken</code> method, except that its declared
 306:    * return value is <code>Object</code> rather than <code>String</code>. It exists so
 307:    * that this class can implement the <code>Enumeration</code> interface.
 308:    *
 309:    * @return the next token in the string.
 310:    *
 311:    * @throws java.util.NoSuchElementException
 312:    *          if there are no more tokens in this tokenizer's string.
 313:    * @see java.util.Enumeration
 314:    * @see org.jfree.report.util.CSVTokenizer#nextToken()
 315:    */
 316:   public Object nextElement ()
 317:   {
 318:     return nextToken();
 319:   }
 320: 
 321:   /**
 322:    * Calculates the number of times that this tokenizer's <code>nextToken</code> method
 323:    * can be called before it generates an exception. The current position is not
 324:    * advanced.
 325:    *
 326:    * @return the number of tokens remaining in the string using the current delimiter
 327:    *         set.
 328:    *
 329:    * @see org.jfree.report.util.CSVTokenizer#nextToken()
 330:    */
 331:   public int countTokens ()
 332:   {
 333:     int count = 0;
 334: 
 335:     final int preserve = this.currentIndex;
 336:     final boolean preserveStart = this.beforeStart;
 337:     while (this.hasMoreTokens())
 338:     {
 339:       this.nextToken();
 340:       count++;
 341:     }
 342:     this.currentIndex = preserve;
 343:     this.beforeStart = preserveStart;
 344: 
 345:     return count;
 346:   }
 347: 
 348:   /**
 349:    * Returns the quate.
 350:    *
 351:    * @return char
 352:    */
 353:   public String getQuate ()
 354:   {
 355:     return this.quate;
 356:   }
 357: 
 358:   /**
 359:    * Sets the quate.
 360:    *
 361:    * @param quate The quate to set
 362:    */
 363:   public void setQuate (final String quate)
 364:   {
 365:     this.quate = quate;
 366:   }
 367: }