001 /** 002 * ======================================== 003 * JFreeReport : a free Java report library 004 * ======================================== 005 * 006 * Project Info: http://reporting.pentaho.org/ 007 * 008 * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors. 009 * 010 * This library is free software; you can redistribute it and/or modify it under the terms 011 * of the GNU Lesser General Public License as published by the Free Software Foundation; 012 * either version 2.1 of the License, or (at your option) any later version. 013 * 014 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 015 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 016 * See the GNU Lesser General Public License for more details. 017 * 018 * You should have received a copy of the GNU Lesser General Public License along with this 019 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 020 * Boston, MA 02111-1307, USA. 021 * 022 * [Java is a trademark or registered trademark of Sun Microsystems, Inc. 023 * in the United States and other countries.] 024 * 025 * ------------ 026 * $Id: CSVTokenizer.java 3525 2007-10-16 11:43:48Z tmorgner $ 027 * ------------ 028 * (C) Copyright 2000-2005, by Object Refinery Limited. 029 * (C) Copyright 2005-2007, by Pentaho Corporation. 030 */ 031 package org.jfree.report.util; 032 033 import java.util.Enumeration; 034 import java.util.NoSuchElementException; 035 036 /** 037 * The csv tokenizer class allows an application to break a Comma Separated Value format 038 * into tokens. The tokenization method is much simpler than the one used by the 039 * <code>StringTokenizer</code> class. The <code>CSVTokenizer</code> methods do not 040 * distinguish among identifiers, numbers, and quoted strings, nor do they recognize and 041 * skip comments. 042 * <p/> 043 * The set of separator (the characters that separate tokens) may be specified either at 044 * creation time or on a per-token basis. 045 * <p/> 046 * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on 047 * whether it was created with the <code>returnSeparators</code> flag having the value 048 * <code>true</code> or <code>false</code>: <ul> <li>If the flag is <code>false</code>, 049 * delimiter characters serve to separate tokens. A token is a maximal sequence of 050 * consecutive characters that are not separator. <li>If the flag is <code>true</code>, 051 * delimiter characters are themselves considered to be tokens. A token is thus either one 052 * delimiter character, or a maximal sequence of consecutive characters that are not 053 * separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current 054 * position within the string to be tokenized. Some operations advance this current 055 * position past the characters processed.<p> A token is returned by taking a substring of 056 * the string that was used to create the <tt>CSVTokenizer</tt> object. 057 * <p/> 058 * The following is one example of the use of the tokenizer. The code: 059 * <blockquote><pre> 060 * CSVTokenizer csvt = new CSVTokenizer("this,is,a,test"); 061 * while (csvt.hasMoreTokens()) { 062 * println(csvt.nextToken()); 063 * } 064 * </pre></blockquote> 065 * <p/> 066 * prints the following output: 067 * <blockquote><pre> 068 * this 069 * is 070 * a 071 * test 072 * </pre></blockquote> 073 * 074 * @author abupon 075 */ 076 public class CSVTokenizer implements Enumeration 077 { 078 /** 079 * The complete record that should be separated into elements. 080 */ 081 private String record; 082 /** 083 * The separator. 084 */ 085 private String separator; 086 /** 087 * The quoting char. 088 */ 089 private String quate; 090 091 /** 092 * the current parsing position. 093 */ 094 private int currentIndex; 095 096 private boolean beforeStart; 097 098 /** 099 * A possible separator constant. 100 */ 101 public static final String SEPARATOR_COMMA = ","; 102 /** 103 * A possible separator constant. 104 */ 105 public static final String SEPARATOR_TAB = "\t"; 106 /** 107 * A possible separator constant. 108 */ 109 public static final String SEPARATOR_SPACE = " "; 110 111 /** 112 * A possible quote character constant. 113 */ 114 public static final String DOUBLE_QUATE = "\""; 115 /** 116 * A possible quote character constant. 117 */ 118 public static final String SINGLE_QUATE = "'"; 119 120 /** 121 * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> 122 * argument is the separator for separating tokens. 123 * <p/> 124 * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator 125 * string is also returned as tokens. separator is returned as a string. If the flag is 126 * <code>false</code>, the separator string is skipped and only serve as separator 127 * between tokens. 128 * 129 * @param aString a string to be parsed. 130 * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, 131 * CSVTokenizer.SPACE, etc.). 132 * @param theQuate the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, 133 * etc.). 134 */ 135 public CSVTokenizer (final String aString, final String theSeparator, 136 final String theQuate) 137 { 138 if (aString == null) 139 { 140 throw new NullPointerException("The given string is null"); 141 } 142 if (theSeparator == null) 143 { 144 throw new NullPointerException("The given separator is null"); 145 } 146 if (theQuate == null) 147 { 148 throw new NullPointerException("The given quate is null"); 149 } 150 this.record = aString.trim(); 151 this.separator = theSeparator; 152 this.quate = theQuate; 153 this.currentIndex = 0; 154 this.beforeStart = true; 155 } 156 157 /** 158 * Constructs a csv tokenizer for the specified string. The characters in the 159 * <code>theSeparator</code> argument are the separator for separating tokens. Separator 160 * string themselves will not be treated as tokens. 161 * 162 * @param aString a string to be parsed. 163 * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, 164 * CSVTokenizer.SPACE, etc.). 165 */ 166 public CSVTokenizer (final String aString, final String theSeparator) 167 { 168 this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE); 169 } 170 171 /** 172 * Constructs a string tokenizer for the specified string. The tokenizer uses the 173 * default separator set, which is <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator 174 * string themselves will not be treated as tokens. 175 * 176 * @param aString a string to be parsed. 177 */ 178 public CSVTokenizer (final String aString) 179 { 180 this(aString, CSVTokenizer.SEPARATOR_COMMA); 181 } 182 183 /** 184 * Tests if there are more tokens available from this tokenizer's string. If this method 185 * returns <tt>true</tt>, then a subsequent call to <tt>nextToken</tt> with no argument 186 * will successfully return a token. 187 * 188 * @return <code>true</code> if and only if there is at least one token in the string 189 * after the current position; <code>false</code> otherwise. 190 */ 191 public boolean hasMoreTokens () 192 { 193 return (this.currentIndex < this.record.length()); 194 } 195 196 /** 197 * Returns the next token from this string tokenizer. 198 * 199 * @return the next token from this string tokenizer. 200 * 201 * @throws NoSuchElementException if there are no more tokens in this tokenizer's 202 * string. 203 * @throws IllegalArgumentException if given parameter string format was wrong 204 */ 205 public String nextToken () 206 throws NoSuchElementException, IllegalArgumentException 207 { 208 209 if (!this.hasMoreTokens()) 210 { 211 throw new NoSuchElementException(); 212 } 213 214 if (beforeStart == false) 215 { 216 currentIndex += this.separator.length(); 217 } 218 else 219 { 220 beforeStart = false; 221 } 222 223 StringBuffer token = new StringBuffer(); 224 if (this.record.startsWith(this.quate, this.currentIndex)) 225 { 226 String rec = this.record.substring(this.currentIndex + this.quate.length()); 227 token.delete(0, token.length()); 228 while (true) 229 { 230 final int end = rec.indexOf(this.quate); 231 if (end < 0) 232 { 233 throw new IllegalArgumentException("Illegal format"); 234 } 235 236 if (!rec.startsWith(this.quate, end + 1)) 237 { 238 token.append(rec.substring(0, end)); 239 break; 240 } 241 token.append(rec.substring(0, end + 1)); 242 rec = rec.substring(end + this.quate.length() * 2); 243 this.currentIndex++; 244 } 245 246 this.currentIndex += (token.length() + this.quate.length() * 2); 247 } 248 else 249 { 250 final int end = this.record.indexOf(this.separator, this.currentIndex); 251 if (end >= 0) 252 { 253 final int start = this.currentIndex; 254 token.delete(0, token.length()); 255 token.append(this.record.substring(start, end)); 256 this.currentIndex = end; 257 } 258 else 259 { 260 final int start = this.currentIndex; 261 token.delete(0, token.length()); 262 token.append(this.record.substring(start)); 263 this.currentIndex = this.record.length(); 264 } 265 } 266 267 return token.toString(); 268 } 269 270 /** 271 * Returns the next token in this string tokenizer's string. First, the set of 272 * characters considered to be separator by this <tt>CSVTokenizer</tt> object is changed 273 * to be the characters in the string <tt>separator</tt>. Then the next token in the 274 * string after the current position is returned. The current position is advanced 275 * beyond the recognized token. The new delimiter set remains the default after this 276 * call. 277 * 278 * @param theSeparator the new separator. 279 * @return the next token, after switching to the new delimiter set. 280 * 281 * @throws java.util.NoSuchElementException 282 * if there are no more tokens in this tokenizer's string. 283 */ 284 public String nextToken (final String theSeparator) 285 { 286 separator = theSeparator; 287 return nextToken(); 288 } 289 290 /** 291 * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that 292 * this class can implement the <code>Enumeration</code> interface. 293 * 294 * @return <code>true</code> if there are more tokens; <code>false</code> otherwise. 295 * 296 * @see java.util.Enumeration 297 * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens() 298 */ 299 public boolean hasMoreElements () 300 { 301 return hasMoreTokens(); 302 } 303 304 /** 305 * Returns the same value as the <code>nextToken</code> method, except that its declared 306 * return value is <code>Object</code> rather than <code>String</code>. It exists so 307 * that this class can implement the <code>Enumeration</code> interface. 308 * 309 * @return the next token in the string. 310 * 311 * @throws java.util.NoSuchElementException 312 * if there are no more tokens in this tokenizer's string. 313 * @see java.util.Enumeration 314 * @see org.jfree.report.util.CSVTokenizer#nextToken() 315 */ 316 public Object nextElement () 317 { 318 return nextToken(); 319 } 320 321 /** 322 * Calculates the number of times that this tokenizer's <code>nextToken</code> method 323 * can be called before it generates an exception. The current position is not 324 * advanced. 325 * 326 * @return the number of tokens remaining in the string using the current delimiter 327 * set. 328 * 329 * @see org.jfree.report.util.CSVTokenizer#nextToken() 330 */ 331 public int countTokens () 332 { 333 int count = 0; 334 335 final int preserve = this.currentIndex; 336 final boolean preserveStart = this.beforeStart; 337 while (this.hasMoreTokens()) 338 { 339 this.nextToken(); 340 count++; 341 } 342 this.currentIndex = preserve; 343 this.beforeStart = preserveStart; 344 345 return count; 346 } 347 348 /** 349 * Returns the quate. 350 * 351 * @return char 352 */ 353 public String getQuate () 354 { 355 return this.quate; 356 } 357 358 /** 359 * Sets the quate. 360 * 361 * @param quate The quate to set 362 */ 363 public void setQuate (final String quate) 364 { 365 this.quate = quate; 366 } 367 }