001    /**
002     * ========================================
003     * JFreeReport : a free Java report library
004     * ========================================
005     *
006     * Project Info:  http://reporting.pentaho.org/
007     *
008     * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors.
009     *
010     * This library is free software; you can redistribute it and/or modify it under the terms
011     * of the GNU Lesser General Public License as published by the Free Software Foundation;
012     * either version 2.1 of the License, or (at your option) any later version.
013     *
014     * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
015     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
016     * See the GNU Lesser General Public License for more details.
017     *
018     * You should have received a copy of the GNU Lesser General Public License along with this
019     * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
020     * Boston, MA 02111-1307, USA.
021     *
022     * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
023     * in the United States and other countries.]
024     *
025     * ------------
026     * $Id: CSVTokenizer.java 3525 2007-10-16 11:43:48Z tmorgner $
027     * ------------
028     * (C) Copyright 2000-2005, by Object Refinery Limited.
029     * (C) Copyright 2005-2007, by Pentaho Corporation.
030     */
031    package org.jfree.report.util;
032    
033    import java.util.Enumeration;
034    import java.util.NoSuchElementException;
035    
036    /**
037     * The csv tokenizer class allows an application to break a Comma Separated Value format
038     * into tokens. The tokenization method is much simpler than the one used by the
039     * <code>StringTokenizer</code> class. The <code>CSVTokenizer</code> methods do not
040     * distinguish among identifiers, numbers, and quoted strings, nor do they recognize and
041     * skip comments.
042     * <p/>
043     * The set of separator (the characters that separate tokens) may be specified either at
044     * creation time or on a per-token basis.
045     * <p/>
046     * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on
047     * whether it was created with the <code>returnSeparators</code> flag having the value
048     * <code>true</code> or <code>false</code>: <ul> <li>If the flag is <code>false</code>,
049     * delimiter characters serve to separate tokens. A token is a maximal sequence of
050     * consecutive characters that are not separator. <li>If the flag is <code>true</code>,
051     * delimiter characters are themselves considered to be tokens. A token is thus either one
052     * delimiter character, or a maximal sequence of consecutive characters that are not
053     * separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current
054     * position within the string to be tokenized. Some operations advance this current
055     * position past the characters processed.<p> A token is returned by taking a substring of
056     * the string that was used to create the <tt>CSVTokenizer</tt> object.
057     * <p/>
058     * The following is one example of the use of the tokenizer. The code:
059     * <blockquote><pre>
060     *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
061     *     while (csvt.hasMoreTokens()) {
062     *         println(csvt.nextToken());
063     *     }
064     * </pre></blockquote>
065     * <p/>
066     * prints the following output:
067     * <blockquote><pre>
068     *     this
069     *     is
070     *     a
071     *     test
072     * </pre></blockquote>
073     *
074     * @author abupon
075     */
076    public class CSVTokenizer implements Enumeration
077    {
078      /**
079       * The complete record that should be separated into elements.
080       */
081      private String record;
082      /**
083       * The separator.
084       */
085      private String separator;
086      /**
087       * The quoting char.
088       */
089      private String quate;
090    
091      /**
092       * the current parsing position.
093       */
094      private int currentIndex;
095    
096      private boolean beforeStart;
097    
098      /**
099       * A possible separator constant.
100       */
101      public static final String SEPARATOR_COMMA = ",";
102      /**
103       * A possible separator constant.
104       */
105      public static final String SEPARATOR_TAB = "\t";
106      /**
107       * A possible separator constant.
108       */
109      public static final String SEPARATOR_SPACE = " ";
110    
111      /**
112       * A possible quote character constant.
113       */
114      public static final String DOUBLE_QUATE = "\"";
115      /**
116       * A possible quote character constant.
117       */
118      public static final String SINGLE_QUATE = "'";
119    
120      /**
121       * Constructs a csv tokenizer for the specified string. <code>theSeparator</code>
122       * argument is the separator for separating tokens.
123       * <p/>
124       * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator
125       * string is also returned as tokens. separator is returned as a string. If the flag is
126       * <code>false</code>, the separator string is skipped and only serve as separator
127       * between tokens.
128       *
129       * @param aString      a string to be parsed.
130       * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
131       *                     CSVTokenizer.SPACE, etc.).
132       * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE,
133       *                     etc.).
134       */
135      public CSVTokenizer (final String aString, final String theSeparator,
136                           final String theQuate)
137      {
138        if (aString == null)
139        {
140          throw new NullPointerException("The given string is null");
141        }
142        if (theSeparator == null)
143        {
144          throw new NullPointerException("The given separator is null");
145        }
146        if (theQuate == null)
147        {
148          throw new NullPointerException("The given quate is null");
149        }
150        this.record = aString.trim();
151        this.separator = theSeparator;
152        this.quate = theQuate;
153        this.currentIndex = 0;
154        this.beforeStart = true;
155      }
156    
157      /**
158       * Constructs a csv tokenizer for the specified string. The characters in the
159       * <code>theSeparator</code> argument are the separator for separating tokens. Separator
160       * string themselves will not be treated as tokens.
161       *
162       * @param aString      a string to be parsed.
163       * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
164       *                     CSVTokenizer.SPACE, etc.).
165       */
166      public CSVTokenizer (final String aString, final String theSeparator)
167      {
168        this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
169      }
170    
171      /**
172       * Constructs a string tokenizer for the specified string. The tokenizer uses the
173       * default separator set, which is <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator
174       * string themselves will not be treated as tokens.
175       *
176       * @param aString a string to be parsed.
177       */
178      public CSVTokenizer (final String aString)
179      {
180        this(aString, CSVTokenizer.SEPARATOR_COMMA);
181      }
182    
183      /**
184       * Tests if there are more tokens available from this tokenizer's string. If this method
185       * returns <tt>true</tt>, then a subsequent call to <tt>nextToken</tt> with no argument
186       * will successfully return a token.
187       *
188       * @return <code>true</code> if and only if there is at least one token in the string
189       *         after the current position; <code>false</code> otherwise.
190       */
191      public boolean hasMoreTokens ()
192      {
193        return (this.currentIndex < this.record.length());
194      }
195    
196      /**
197       * Returns the next token from this string tokenizer.
198       *
199       * @return the next token from this string tokenizer.
200       *
201       * @throws NoSuchElementException   if there are no more tokens in this tokenizer's
202       *                                  string.
203       * @throws IllegalArgumentException if given parameter string format was wrong
204       */
205      public String nextToken ()
206              throws NoSuchElementException, IllegalArgumentException
207      {
208    
209        if (!this.hasMoreTokens())
210        {
211          throw new NoSuchElementException();
212        }
213    
214        if (beforeStart == false)
215        {
216          currentIndex += this.separator.length();
217        }
218        else
219        {
220          beforeStart = false;
221        }
222    
223        StringBuffer token = new StringBuffer();
224        if (this.record.startsWith(this.quate, this.currentIndex))
225        {
226          String rec = this.record.substring(this.currentIndex + this.quate.length());
227          token.delete(0, token.length());
228          while (true)
229          {
230            final int end = rec.indexOf(this.quate);
231            if (end < 0)
232            {
233              throw new IllegalArgumentException("Illegal format");
234            }
235    
236            if (!rec.startsWith(this.quate, end + 1))
237            {
238              token.append(rec.substring(0, end));
239              break;
240            }
241            token.append(rec.substring(0, end + 1));
242            rec = rec.substring(end + this.quate.length() * 2);
243            this.currentIndex++;
244          }
245    
246          this.currentIndex += (token.length() + this.quate.length() * 2);
247        }
248        else
249        {
250          final int end = this.record.indexOf(this.separator, this.currentIndex);
251          if (end >= 0)
252          {
253            final int start = this.currentIndex;
254            token.delete(0, token.length());
255            token.append(this.record.substring(start, end));
256            this.currentIndex = end;
257          }
258          else
259          {
260            final int start = this.currentIndex;
261            token.delete(0, token.length());
262            token.append(this.record.substring(start));
263            this.currentIndex = this.record.length();
264          }
265        }
266    
267        return token.toString();
268      }
269    
270      /**
271       * Returns the next token in this string tokenizer's string. First, the set of
272       * characters considered to be separator by this <tt>CSVTokenizer</tt> object is changed
273       * to be the characters in the string <tt>separator</tt>. Then the next token in the
274       * string after the current position is returned. The current position is advanced
275       * beyond the recognized token.  The new delimiter set remains the default after this
276       * call.
277       *
278       * @param theSeparator the new separator.
279       * @return the next token, after switching to the new delimiter set.
280       *
281       * @throws java.util.NoSuchElementException
282       *          if there are no more tokens in this tokenizer's string.
283       */
284      public String nextToken (final String theSeparator)
285      {
286        separator = theSeparator;
287        return nextToken();
288      }
289    
290      /**
291       * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that
292       * this class can implement the <code>Enumeration</code> interface.
293       *
294       * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
295       *
296       * @see java.util.Enumeration
297       * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens()
298       */
299      public boolean hasMoreElements ()
300      {
301        return hasMoreTokens();
302      }
303    
304      /**
305       * Returns the same value as the <code>nextToken</code> method, except that its declared
306       * return value is <code>Object</code> rather than <code>String</code>. It exists so
307       * that this class can implement the <code>Enumeration</code> interface.
308       *
309       * @return the next token in the string.
310       *
311       * @throws java.util.NoSuchElementException
312       *          if there are no more tokens in this tokenizer's string.
313       * @see java.util.Enumeration
314       * @see org.jfree.report.util.CSVTokenizer#nextToken()
315       */
316      public Object nextElement ()
317      {
318        return nextToken();
319      }
320    
321      /**
322       * Calculates the number of times that this tokenizer's <code>nextToken</code> method
323       * can be called before it generates an exception. The current position is not
324       * advanced.
325       *
326       * @return the number of tokens remaining in the string using the current delimiter
327       *         set.
328       *
329       * @see org.jfree.report.util.CSVTokenizer#nextToken()
330       */
331      public int countTokens ()
332      {
333        int count = 0;
334    
335        final int preserve = this.currentIndex;
336        final boolean preserveStart = this.beforeStart;
337        while (this.hasMoreTokens())
338        {
339          this.nextToken();
340          count++;
341        }
342        this.currentIndex = preserve;
343        this.beforeStart = preserveStart;
344    
345        return count;
346      }
347    
348      /**
349       * Returns the quate.
350       *
351       * @return char
352       */
353      public String getQuate ()
354      {
355        return this.quate;
356      }
357    
358      /**
359       * Sets the quate.
360       *
361       * @param quate The quate to set
362       */
363      public void setQuate (final String quate)
364      {
365        this.quate = quate;
366      }
367    }