Source for gnu.javax.swing.text.html.parser.support.Parser

   1: /* Parser.java -- HTML parser.
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser.support;
  40: 
  41: import gnu.javax.swing.text.html.parser.htmlAttributeSet;
  42: import gnu.javax.swing.text.html.parser.htmlValidator;
  43: import gnu.javax.swing.text.html.parser.support.low.Constants;
  44: import gnu.javax.swing.text.html.parser.support.low.ParseException;
  45: import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
  46: import gnu.javax.swing.text.html.parser.support.low.Token;
  47: import gnu.javax.swing.text.html.parser.support.low.node;
  48: import gnu.javax.swing.text.html.parser.support.low.pattern;
  49: 
  50: import java.io.IOException;
  51: import java.io.Reader;
  52: 
  53: import java.util.Comparator;
  54: import java.util.Set;
  55: import java.util.TreeSet;
  56: import java.util.Vector;
  57: 
  58: import javax.swing.text.ChangedCharSetException;
  59: import javax.swing.text.html.HTML;
  60: import javax.swing.text.html.parser.AttributeList;
  61: import javax.swing.text.html.parser.DTD;
  62: import javax.swing.text.html.parser.DTDConstants;
  63: import javax.swing.text.html.parser.Element;
  64: import javax.swing.text.html.parser.Entity;
  65: import javax.swing.text.html.parser.TagElement;
  66: 
  67: /**
  68:  * <p>A simple error-tolerant HTML parser that uses a DTD document
  69:  * to access data on the possible tokens, arguments and syntax.</p>
  70:  * <p> The parser reads an HTML content from a Reader and calls various
  71:  * notifying methods (which should be overridden in a subclass)
  72:  * when tags or data are encountered.</p>
  73:  * <p>Some HTML elements need no opening or closing tags. The
  74:  * task of this parser is to invoke the tag handling methods also when
  75:  * the tags are not explicitly specified and must be supposed using
  76:  * information, stored in the DTD.
  77:  * For  example, parsing the document
  78:  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
  79:  * will invoke the handling methods in exactly the same order
  80:  * (and with the same parameters) as if parsing the document: <br>
  81:  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
  82:  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
  83:  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
  84:  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
  85:  * (supposed tags are given in italics). The parser also supports
  86:  * obsolete elements of HTML syntax.<p>
  87:  * </p>
  88:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  89:  */
  90: public class Parser
  91:   extends ReaderTokenizer
  92:   implements DTDConstants
  93: {
  94:   /**
  95:    * The current html tag.
  96:    */
  97:   public Token hTag = new Token();
  98: 
  99:   /**
 100:    * The document template description that will be used to parse the documents.
 101:    */
 102:   protected DTD dtd;
 103: 
 104:   /**
 105:    * The value of this field determines whether or not the Parser will be
 106:    * strict in enforcing SGML compatibility. The default value is false,
 107:    * stating that the parser should do everything to parse and get at least
 108:    * some information even from the incorrectly written HTML input.
 109:    */
 110:   protected boolean strict;
 111: 
 112:   /**
 113:    * This fields has positive values in preformatted tags.
 114:    */
 115:   protected int preformatted = 0;
 116: 
 117:   /**
 118:    * The set of the document tags. This field is used for supporting
 119:    * markFirstTime().
 120:    */
 121:   private Set documentTags =
 122:     new TreeSet(new Comparator()
 123:       {
 124:         public int compare(Object a, Object b)
 125:         {
 126:           return ((String) a).compareToIgnoreCase((String) b);
 127:         }
 128:       }
 129:                );
 130: 
 131:   /**
 132:   * The buffer to collect the incremental output like text or coment.
 133:   */
 134:   private StringBuffer buffer = new StringBuffer();
 135: 
 136:   /**
 137:    * The buffer to store the document title.
 138:    */
 139:   private StringBuffer title = new StringBuffer();
 140: 
 141:   /**
 142:    * The current token.
 143:    */
 144:   private Token t;
 145: 
 146:   /**
 147:    * True means that the 'title' tag of this document has
 148:    * already been handled.
 149:    */
 150:   private boolean titleHandled;
 151: 
 152:   /**
 153:    * True means that the 'title' tag is currently open and all
 154:    * text is also added to the title buffer.
 155:    */
 156:   private boolean titleOpen;
 157: 
 158:   /**
 159:    * The attributes of the current HTML element.
 160:    * Package-private to avoid an accessor method.
 161:    */
 162:   htmlAttributeSet attributes =
 163:     htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 164: 
 165:   /**
 166:    * The validator, controlling the forcible closing of the tags that
 167:    * (in accordance to dtd) are not allowed in the current context.
 168:    */
 169:   private htmlValidator validator;
 170: 
 171:   /**
 172:    * Provides the default values for parameters in the case when these
 173:    * values are defined in the DTD.
 174:    */
 175:   private parameterDefaulter defaulter;
 176: 
 177:   /**
 178:    * The text pre-processor for handling line ends and tabs.
 179:    */
 180:   private textPreProcessor textProcessor = new textPreProcessor();
 181: 
 182:   /**
 183:    * Creates a new Parser that uses the given
 184:    * {@link javax.swing.text.html.parser.DTD }. The only standard way
 185:    * to get an instance of DTD is to construct it manually, filling in
 186:    * all required fields.
 187:    * @param a_dtd The DTD to use. The parser behaviour after passing null
 188:    * as an argument is not documented and may vary between implementations.
 189:    */
 190:   public Parser(DTD a_dtd)
 191:   {
 192:     if (a_dtd == null)
 193:       dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
 194:     else
 195:       dtd = a_dtd;
 196: 
 197:     defaulter = new parameterDefaulter(dtd);
 198: 
 199:     validator =
 200:       new htmlValidator(dtd)
 201:         {
 202:           /**
 203:            * Handles the error message. This method must be overridden to pass
 204:            * the message where required.
 205:            * @param msg The message text.
 206:            */
 207:           protected void s_error(String msg)
 208:           {
 209:             error(msg);
 210:           }
 211: 
 212:           /**
 213:            * The method is called when the tag validator decides to close the
 214:            * tag on its own initiative. After reaching the end of stream,
 215:            * The tag validator closes all unclosed elements that are required
 216:            * to have the end (closing) tag.
 217:            *
 218:            * @param element The tag being fictionally (forcibly) closed.
 219:            */
 220:           protected void handleSupposedEndTag(Element tElement)
 221:           {
 222:             // The tag is cloned as the original tElement is the
 223:             // element from the starting tag - may be accidently used
 224:             // somewhere else.
 225:             TagElement tag = makeTag(tElement, true);
 226:             _handleEndTag_remaining(tag);
 227:           }
 228: 
 229:           /**
 230:            * The method is called when the the tag validator decides to open
 231:            * the new tag on its own initiative. The tags, opened in this
 232:            * way, are HTML, HEAD and BODY. The attribute set is temporary
 233:            * assigned to the empty one, the previous value is
 234:            * restored before return.
 235:            *
 236:            * @param element The tag being fictionally (forcibly) closed.
 237:            */
 238:           protected void handleSupposedStartTag(Element tElement)
 239:           {
 240:             TagElement tag = makeTag(tElement, true);
 241:             htmlAttributeSet were = attributes;
 242:             attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 243:             _handleStartTag(tag);
 244:             attributes = were;
 245:           }
 246:         };
 247:   }
 248: 
  /**
   * Get the attributes of the current tag. The method returns the
   * attribute set that the parser currently holds; no copy is made.
   * @return The attribute set, representing the attributes of the current tag.
   */
  public htmlAttributeSet getAttributes()
  {
    return attributes;
  }
 257: 
 258:   /**
 259:    * Invokes the error handler. The default method in this implementation
 260:    * delegates the call to handleError, also providing the current line.
 261:    */
 262:   public void error(String msg)
 263:   {
 264:     error(msg, getTokenAhead());
 265:   }
 266: 
 267:   public void error(String msg, Token atToken)
 268:   {
 269:     if (atToken != null)
 270:       handleError(atToken.where.beginLine,
 271:                   msg + ": line " + atToken.where.beginLine +
 272:                   ", absolute pos " + atToken.where.startPosition
 273:                  );
 274:     else
 275:       handleError(0, msg);
 276:   }
 277: 
 278:   /**
 279:    * Invokes the error handler. The default method in this implementation
 280:    * delegates the call to error (parm1+": '"+parm2+"'").
 281:    */
 282:   public void error(String msg, String invalid)
 283:   {
 284:     error(msg + ": '" + invalid + "'");
 285:   }
 286: 
 287:   /**
 288:    * Invokes the error handler. The default method in this implementation
 289:    * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
 290:    */
 291:   public void error(String parm1, String parm2, String parm3)
 292:   {
 293:     error(parm1 + " " + parm2 + " " + parm3);
 294:   }
 295: 
 296:   /**
 297:    * Invokes the error handler. The default method in this implementation
 298:    * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
 299:    */
 300:   public void error(String parm1, String parm2, String parm3, String parm4)
 301:   {
 302:     error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
 303:   }
 304: 
  /**
   * Flushes the attributes. This implementation is empty and returns
   * without action.
   */
  public void flushAttributes()
  {
  }
 308: 
 309:   /**
 310:    * Parse the HTML text, calling various methods in response to the
 311:    * occurence of the corresponding HTML constructions.
 312:    * @param reader The reader to read the source HTML from.
 313:    * @throws IOException If the reader throws one.
 314:    */
 315:   public synchronized void parse(Reader reader)
 316:                           throws IOException
 317:   {
 318:     reset(reader);
 319:     restart();
 320:     try
 321:       {
 322:         parseDocument();
 323:         validator.closeAll();
 324:       }
 325:     catch (ParseException ex)
 326:       {
 327:         if (ex != null)
 328:           {
 329:             error("Unable to continue parsing the document", ex.getMessage());
 330: 
 331:             Throwable cause = ex.getCause();
 332:             if (cause instanceof IOException)
 333:               throw (IOException) cause;
 334:           }
 335:       }
 336:   }
 337: 
  /**
   * Parses DTD markup declaration. This implementation performs no
   * parsing and always returns null.
   * @return null.
   * @throws IOException declared by the signature; this implementation
   * does not throw it.
   */
  public String parseDTDMarkup()
                        throws IOException
  {
    return null;
  }
 348: 
  /**
   * Parse SGML insertion ( &lt;! ... &gt; ). When the SGML insertion
   * that is not a comment is found, Sgml() calls this method, passing
   * the collected content in the string buffer as a parameter.
   * The default method returns false without action and can be
   * overridden to implement user - defined SGML support. An IOException
   * thrown from here is reported by Sgml() through the error handler.
   * <p>
   * If you need more information about SGML insertions in HTML documents,
   * the author suggests reading the SGML tutorial on
   * <a href="http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">
   * http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html</a>.
   * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
   * Oxford University Press, 688 p, ISBN: 0198537379.
   * </p>
   * @param strBuff The buffer holding the content of the SGML insertion.
   * @return true if this is a valid DTD markup declaration
   * (this implementation always returns false).
   * @throws IOException declared by the signature; this implementation
   * does not throw it.
   */
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
                                  throws IOException
  {
    return false;
  }
 371: 
  /**
   * Get the first line of the last parsed token. The value is read
   * from the location of the current html tag (hTag).
   * @return the begin line of hTag.
   */
  protected int getCurrentLine()
  {
    return hTag.where.beginLine;
  }
 379: 
 380:   /**
 381:    * Read parseable character data, add to buffer.
 382:    * @param clearBuffer If true, buffer if filled by CDATA section,
 383:    * otherwise the section is appended to the existing content of the
 384:    * buffer.
 385:    *
 386:    * @throws ParseException
 387:    */
 388:   protected void CDATA(boolean clearBuffer)
 389:                 throws ParseException
 390:   {
 391:     Token start = hTag = getTokenAhead();
 392: 
 393:     if (clearBuffer)
 394:       buffer.setLength(0);
 395: 
 396:     // Handle expected EOF.
 397:     if (start.kind == EOF)
 398:       return;
 399: 
 400:     read: 
 401:     while (true)
 402:       {
 403:         t = getTokenAhead();
 404:         if (t.kind == EOF)
 405:           {
 406:             error("unexpected eof", t);
 407:             break read;
 408:           }
 409:         else if (t.kind == BEGIN)
 410:           break read;
 411:         else if (t.kind == Constants.ENTITY)
 412:           {
 413:             resolveAndAppendEntity(t);
 414:             getNextToken();
 415:           }
 416:         else
 417:           {
 418:             append(t);
 419:             getNextToken();
 420:           }
 421:       }
 422:     hTag = new Token(start, getTokenAhead(0));
 423:     if (buffer.length() != 0)
 424:       _handleText();
 425:   }
 426: 
 427:   /**
 428:   * Process Comment. This method skips till --> without
 429:   * taking SGML constructs into consideration.  The supported SGML
 430:   * constructs are handled separately.
 431:   */
 432:   protected void Comment()
 433:                   throws ParseException
 434:   {
 435:     buffer.setLength(0);
 436: 
 437:     Token start = hTag = mustBe(BEGIN);
 438:     optional(WS);
 439:     mustBe(EXCLAMATION);
 440:     optional(WS);
 441:     mustBe(DOUBLE_DASH);
 442: 
 443:     Token t;
 444:     Token last;
 445: 
 446:     comment: 
 447:     while (true)
 448:       {
 449:         t = getTokenAhead();
 450:         if (t.kind == EOF)
 451:           {
 452:             handleEOFInComment();
 453:             last = t;
 454:             break comment;
 455:           }
 456:         else if (COMMENT_END.matches(this))
 457:           {
 458:             mustBe(DOUBLE_DASH);
 459:             optional(WS);
 460:             last = mustBe(END);
 461:             break comment;
 462:           }
 463:         else if (COMMENT_TRIPLEDASH_END.matches(this))
 464:           {
 465:             mustBe(DOUBLE_DASH);
 466:             t = mustBe(NUMTOKEN);
 467:             if (t.getImage().equals("-"))
 468:               {
 469:                 append(t);
 470:                 last = mustBe(END);
 471:                 break comment;
 472:               }
 473:             else
 474:               {
 475:                 buffer.append("--");
 476:                 append(t);
 477:                 t = getTokenAhead();
 478:               }
 479:           }
 480:         else
 481:         /* The lllll-- can match as NUMTOKEN */
 482:         if ((t.getImage().endsWith("--")) &&
 483:             (
 484:               getTokenAhead(1).kind == END ||
 485:               (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
 486:             )
 487:            )
 488:           {
 489:             buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
 490: 
 491:             /* Skip the closing > that we have already checked. */
 492:             last = mustBe(t.kind);
 493:             break comment;
 494:           }
 495:         else
 496:           append(t);
 497:         mustBe(t.kind);
 498:       }
 499:     hTag = new Token(start, last);
 500:     handleComment();
 501:   }
 502: 
 503:   /**
 504:   * Read a script. The text, returned without any changes,
 505:   * is terminated only by the closing tag SCRIPT.
 506:   */
 507:   protected void Script()
 508:                  throws ParseException
 509:   {
 510:     Token name;
 511: 
 512:     Token start = hTag = mustBe(BEGIN);
 513:     optional(WS);
 514: 
 515:     name = mustBe(SCRIPT);
 516: 
 517:     optional(WS);
 518: 
 519:     restOfTag(false, name, start);
 520: 
 521:     buffer.setLength(0);
 522: 
 523:     script: 
 524:     while (!SCRIPT_CLOSE.matches(this))
 525:       {
 526:         append(getNextToken());
 527:       }
 528: 
 529:     consume(SCRIPT_CLOSE);
 530: 
 531:     _handleText();
 532: 
 533:     endTag(false);
 534:     _handleEndTag(makeTagElement(name.getImage(), false));
 535:   }
 536: 
 537:   /**
 538:   * Process SGML insertion that is not a comment.
 539:   */
 540:   protected void Sgml()
 541:                throws ParseException
 542:   {
 543:     if (COMMENT_OPEN.matches(this))
 544:       Comment();
 545:     else // skip till ">"
 546:       {
 547:         Token start = hTag = mustBe(BEGIN);
 548:         optional(WS);
 549:         mustBe(EXCLAMATION);
 550: 
 551:         buffer.setLength(0);
 552:         read: 
 553:         while (true)
 554:           {
 555:             t = getNextToken();
 556:             if (t.kind == Constants.ENTITY)
 557:               {
 558:                 resolveAndAppendEntity(t);
 559:               }
 560:             else if (t.kind == EOF)
 561:               {
 562:                 error("unexpected eof", t);
 563:                 break read;
 564:               }
 565:             else if (t.kind == END)
 566:               break read;
 567:             else
 568:               append(t);
 569:           }
 570: 
 571:         try
 572:           {
 573:             parseMarkupDeclarations(buffer);
 574:           }
 575:         catch (IOException ex)
 576:           {
 577:             error("Unable to parse SGML insertion: '" + buffer + "'",
 578:                   new Token(start, t)
 579:                  );
 580:           }
 581:       }
 582:   }
 583: 
 584:   /**
 585:   * Read a style definition. The text, returned without any changes,
 586:   * is terminated only by the closing tag STYLE.
 587:   */
 588:   protected void Style()
 589:                 throws ParseException
 590:   {
 591:     Token name;
 592: 
 593:     Token start = hTag = mustBe(BEGIN);
 594:     optional(WS);
 595: 
 596:     name = mustBe(STYLE);
 597: 
 598:     optional(WS);
 599: 
 600:     restOfTag(false, name, start);
 601: 
 602:     buffer.setLength(0);
 603: 
 604:     style: 
 605:     while (!STYLE_CLOSE.matches(this))
 606:       {
 607:         append(getNextToken());
 608:       }
 609: 
 610:     consume(STYLE_CLOSE);
 611: 
 612:     _handleText();
 613: 
 614:     endTag(false);
 615:     _handleEndTag(makeTagElement(name.getImage(), false));
 616:   }
 617: 
 618:   /**
 619:    * Read a html tag.
 620:    */
 621:   protected void Tag()
 622:               throws ParseException
 623:   {
 624:     mark(true);
 625: 
 626:     boolean closing = false;
 627:     Token name;
 628:     Token start = hTag = mustBe(BEGIN);
 629: 
 630:     optional(WS);
 631:     name = getNextToken();
 632:     optional(WS);
 633: 
 634:     if (name.kind == SLASH)
 635:       {
 636:         closing = true;
 637:         name = getNextToken();
 638:       }
 639: 
 640:     restOfTag(closing, name, start);
 641:   }
 642: 
 643:   /**
 644:    * A hook, for operations, preceeding call to handleText.
 645:    * Handle text in a string buffer.
 646:    * In non - preformatted mode, all line breaks immediately following the
 647:    * start tag and immediately before an end tag is discarded,
 648:    * \r, \n and \t are replaced by spaces, multiple space are replaced
 649:    * by the single one and the result is  moved into array,
 650:    * passing it  to handleText().
 651:    */
 652:   protected void _handleText()
 653:   {
 654:     char[] text;
 655: 
 656:     if (preformatted > 0)
 657:       text = textProcessor.preprocessPreformatted(buffer);
 658:     else
 659:       text = textProcessor.preprocess(buffer);
 660: 
 661:     if (text != null && text.length > 0)
 662:       {
 663:         TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
 664:         attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 665:         _handleEmptyTag(pcdata);
 666: 
 667:         handleText(text);
 668:         if (titleOpen)
 669:           title.append(text);
 670:       }
 671:   }
 672: 
 673:   /**
 674:    * Add the image of this token to the buffer.
 675:    * @param t A token to append.
 676:    */
 677:   protected final void append(Token t)
 678:   {
 679:     if (t.kind != EOF)
 680:       t.appendTo(buffer);
 681:   }
 682: 
 683:   /**
 684:    * Consume pattern that must match.
 685:    * @param p A pattern to consume.
 686:    */
 687:   protected final void consume(pattern p)
 688:   {
 689:     node n;
 690:     for (int i = 0; i < p.nodes.length; i++)
 691:       {
 692:         n = p.nodes [ i ];
 693:         if (n.optional)
 694:           optional(n.kind);
 695:         else
 696:           mustBe(n.kind);
 697:       }
 698:   }
 699: 
  /**
   * The method is called when the HTML end (closing) tag is found or if
   * the parser concludes that the one should be present in the
   * current position. The method is called immediately
   * before calling the handleEndTag(). The default method returns
   * without action.
   * @param omitted True if the tag is not actually present in the document,
   * but is supposed by the parser (like &lt;/html&gt; at the end of the
   * document).
   */
  protected void endTag(boolean omitted)
  {
  }
 712: 
  /**
   * Handle HTML comment. The default method returns without action.
   * @param comment The comment text.
   */
  protected void handleComment(char[] comment)
  {
  }
 720: 
  /**
   * This is additionally called when the HTML content terminates
   * without closing the HTML comment. This can only happen if the
   * HTML document contains errors (for example, the closing --&gt; is
   * missing). The default method reports the "Unclosed comment" error.
   */
  protected void handleEOFInComment()
  {
    error("Unclosed comment");
  }
 731: 
  /**
   * Handle the tag with no content, like &lt;br&gt;. The method is
   * called for the elements that, in accordance with the current DTD,
   * have an empty content. The default method returns without action.
   * @param tag The tag being handled.
   * @throws javax.swing.text.ChangedCharSetException declared by the
   * signature; this implementation does not throw it.
   */
  protected void handleEmptyTag(TagElement tag)
                         throws javax.swing.text.ChangedCharSetException
  {
  }
 743: 
  /**
   * The method is called when the HTML closing tag (like &lt;/table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag being closed.
   */
  protected void handleEndTag(TagElement tag)
  {
  }
 753: 
  /**
   * Handle error that has occurred in the given line. The method is
   * invoked by error(String, Token), which passes 0 as the line when
   * no token is available. The default method returns without action.
   * @param line The line where the error was detected.
   * @param message The error message.
   */
  protected void handleError(int line, String message)
  {
  }
 758: 
  /**
   * The method is called when the HTML opening tag (like &lt;table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag being opened.
   */
  protected void handleStartTag(TagElement tag)
  {
  }
 768: 
  /**
   * Handle the text section. The default method returns without action.
   * <p> For non-preformatted section, the parser replaces
   * \t, \r and \n by spaces and then multiple spaces
   * by a single space. Additionally, all whitespace around
   * tags is discarded.
   * </p>
   * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser preserves
   * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
   * if it is present. Additionally, it replaces each occurrence of \r or \r\n
   * by a single \n.</p>
   *
   * @param text A section text.
   */
  protected void handleText(char[] text)
  {
  }
 786: 
  /**
   * Handle HTML &lt;title&gt; tag. This method is invoked when
   * both title starting and closing tags are already behind.
   * The passed argument contains the concatenation of all
   * title text sections. The default method returns without action.
   * @param title The title text.
   */
  protected void handleTitle(char[] title)
  {
  }
 797: 
  /**
   * Constructs the tag from the given element, delegating to
   * makeTag(element, false). In this implementation,
   * this is defined, but never called.
   * @param element the tag base
   * {@link javax.swing.text.html.parser.Element}
   * @return the tag
   */
  protected TagElement makeTag(Element element)
  {
    return makeTag(element, false);
  }
 807: 
  /**
   * Constructs the tag from the given element.
   * @param element the tag base
   * {@link javax.swing.text.html.parser.Element}
   * @param isSupposed true if the tag is not actually present in the
   * html input, but the parser supposes that it should occur in
   * the current location.
   * @return the newly created tag
   */
  protected TagElement makeTag(Element element, boolean isSupposed)
  {
    return new TagElement(element, isSupposed);
  }
 820: 
  /**
   * This is called when the tag, representing the given element,
   * occurs for the first time in the document. The default method
   * returns without action.
   * @param element The element whose tag occurred for the first time.
   */
  protected void markFirstTime(Element element)
  {
  }
 829: 
 830:   /**
 831:    * Consume the token that was checked before and hence MUST be present.
 832:    * @param kind The kind of token to consume.
 833:    */
 834:   protected Token mustBe(int kind)
 835:   {
 836:     if (getTokenAhead().kind == kind)
 837:       return getNextToken();
 838:     else
 839:       {
 840:         String ei = "";
 841:         if (kind < 1000)
 842:           ei = " ('" + (char) kind + "') ";
 843:         throw new AssertionError("The token of kind " + kind + ei +
 844:                                  " MUST be here,"
 845:                                 );
 846:       }
 847:   }
 848: 
 849:   /**
 850:    * Handle attribute without value. The default method uses
 851:    * the only allowed attribute value from DTD.
 852:    * If the attribute is unknown or allows several values,
 853:    * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
 854:    * this value is added to the attribute set.
 855:    * @param element The name of element.
 856:    * @param attribute The name of attribute without value.
 857:    */
 858:   protected void noValueAttribute(String element, String attribute)
 859:   {
 860:     Object value = HTML.NULL_ATTRIBUTE_VALUE;
 861: 
 862:     Element e = (Element) dtd.elementHash.get(element.toLowerCase());
 863:     if (e != null)
 864:       {
 865:         AttributeList attr = e.getAttribute(attribute);
 866:         if (attr != null)
 867:           {
 868:             Vector values = attr.values;
 869:             if (values != null && values.size() == 1)
 870:               value = values.get(0);
 871:           }
 872:       }
 873:     attributes.addAttribute(attribute, value);
 874:   }
 875: 
 876:   /**
 877:    * Consume the optional token, if present.
 878:    * @param kind The kind of token to consume.
 879:    */
 880:   protected Token optional(int kind)
 881:   {
 882:     if (getTokenAhead().kind == kind)
 883:       return getNextToken();
 884:     else
 885:       return null;
 886:   }
 887: 
 888:   /** Parse the html document. */
 889:   protected void parseDocument()
 890:                         throws ParseException
 891:   {
 892:     while (getTokenAhead().kind != EOF)
 893:       {
 894:         advanced = false;
 895:         if (TAG.matches(this))
 896:           Tag();
 897:         else if (COMMENT_OPEN.matches(this))
 898:           Comment();
 899:         else if (STYLE_OPEN.matches(this))
 900:           Style();
 901:         else if (SCRIPT_OPEN.matches(this))
 902:           Script();
 903:         else if (SGML.matches(this))
 904:           Sgml();
 905:         else
 906:           CDATA(true);
 907: 
 908:         // Surely HTML error, treat as a text.
 909:         if (!advanced)
 910:           {
 911:             Token wrong = getNextToken();
 912:             error("unexpected '" + wrong.getImage() + "'", wrong);
 913:             buffer.setLength(0);
 914:             buffer.append(wrong.getImage());
 915:             _handleText();
 916:           }
 917:       }
 918:   }
 919: 
 920:   /**
 921:    * Read the element attributes, adding them into attribute set.
 922:    * @param element The element name (needed to access attribute
 923:    * information in dtd).
 924:    */
 925:   protected void readAttributes(String element)
 926:   {
 927:     Token name;
 928:     Token value;
 929:     Token next;
 930:     String attrValue;
 931: 
 932:     attributes = new htmlAttributeSet();
 933: 
 934:     optional(WS);
 935: 
 936:     attributeReading: 
 937:       while (getTokenAhead().kind == NUMTOKEN)
 938:       {
 939:         name = getNextToken();
 940:         optional(WS);
 941: 
 942:         next = getTokenAhead();
 943:         if (next.kind == EQ)
 944:           {
 945:             mustBe(EQ);
 946:             optional(WS);
 947: 
 948:             next = getNextToken();
 949: 
 950:             switch (next.kind)
 951:               {
 952:               case QUOT:
 953: 
 954:                 // read "quoted" attribute.
 955:                 buffer.setLength(0);
 956:                 readTillTokenE(QUOT);
 957:                 attrValue = buffer.toString();
 958:                 break;
 959: 
 960:               case AP:
 961: 
 962:                 // read 'quoted' attribute.
 963:                 buffer.setLength(0);
 964:                 readTillTokenE(AP);
 965:                 attrValue = buffer.toString();
 966:                 break;
 967: 
 968:               // read unquoted attribute.
 969:               case NUMTOKEN:
 970:                 value = next;
 971:                 optional(WS);
 972: 
 973:                 // Check maybe the opening quote is missing.
 974:                 next = getTokenAhead();
 975:                 if (bQUOTING.get(next.kind))
 976:                   {
 977:                     hTag = next;
 978:                     error("The value without opening quote is closed with '"
 979:                           + next.getImage() + "'");
 980:                     attrValue = value.getImage();
 981:                   }
 982:                 else if (next.kind == SLASH)
 983:                 // The slash in this context is treated as the ordinary
 984:                 // character, not as a token. The slash may be part of
 985:                 // the unquoted URL.
 986:                   {
 987:                     StringBuffer image = new StringBuffer(value.getImage());
 988:                     while (next.kind == NUMTOKEN || next.kind == SLASH)
 989:                       {
 990:                         image.append(getNextToken().getImage());
 991:                         next = getTokenAhead();
 992:                       }
 993:                     attrValue = image.toString();
 994:                   }
 995:                 else
 996:                   attrValue = value.getImage();
 997:                 break;
 998: 
 999:               case SLASH:
1000:                 value = next;
1001:                 optional(WS);
1002:                 
1003:                 // Check maybe the opening quote is missing.
1004:                 next = getTokenAhead();
1005:                 if (bQUOTING.get(next.kind))
1006:                   {
1007:                     hTag = next;
1008:                     error("The value without opening quote is closed with '"
1009:                           + next.getImage() + "'");
1010:                     attrValue = value.getImage();
1011:                   }
1012:                 else if (next.kind == NUMTOKEN || next.kind == SLASH)
1013:                 // The slash in this context is treated as the ordinary
1014:                 // character, not as a token. The slash may be part of
1015:                 // the unquoted URL.
1016:                   {
1017:                     StringBuffer image = new StringBuffer(value.getImage());
1018:                     while (next.kind == NUMTOKEN || next.kind == SLASH)
1019:                       {
1020:                         image.append(getNextToken().getImage());
1021:                         next = getTokenAhead();
1022:                       }
1023:                     attrValue = image.toString();
1024:                   }
1025:                 else
1026:                   attrValue = value.getImage();
1027:                 break;
1028:               default:
1029:                 break attributeReading;
1030:               }
1031:             attributes.addAttribute(name.getImage(), attrValue);
1032:             optional(WS);
1033:           }
1034:         else
1035:           // The '=' is missing: attribute without value.
1036:           {
1037:             noValueAttribute(element, name.getImage());
1038:           }
1039:       }
1040:   }
1041: 
1042:   /**
1043:    * Return string, corresponding the given named entity. The name is passed
1044:    * with the preceeding &, but without the ending semicolon.
1045:    */
1046:   protected String resolveNamedEntity(final String a_tag)
1047:   {
1048:     // Discard &
1049:     if (!a_tag.startsWith("&"))
1050:       throw new AssertionError("Named entity " + a_tag +
1051:                                " must start witn '&'."
1052:                               );
1053: 
1054:     String tag = a_tag.substring(1);
1055: 
1056:     try
1057:       {
1058:         Entity entity = dtd.getEntity(tag);
1059:         if (entity != null)
1060:           return entity.getString();
1061: 
1062:         entity = dtd.getEntity(tag.toLowerCase());
1063: 
1064:         if (entity != null)
1065:           {
1066:             error("The name of this entity should be in lowercase", a_tag);
1067:             return entity.getString();
1068:           }
1069:       }
1070:     catch (IndexOutOfBoundsException ibx)
1071:       {
1072:         /* The error will be reported. */
1073:       }
1074: 
1075:     error("Unknown named entity", a_tag);
1076:     return a_tag;
1077:   }
1078: 
1079:   /**
1080:    * Return char, corresponding the given numeric entity.
1081:    * The name is passed with the preceeding &#, but without
1082:    * the ending semicolon.
1083:    */
1084:   protected char resolveNumericEntity(final String a_tag)
1085:   {
1086:     // Discard &#
1087:     if (!a_tag.startsWith("&#"))
1088:       throw new AssertionError("Numeric entity " + a_tag +
1089:                                " must start witn '&#'."
1090:                               );
1091: 
1092:     String tag = a_tag.substring(2);
1093: 
1094:     try
1095:       {
1096:         // Determine the encoding type:
1097:         char cx = tag.charAt(0);
1098:         if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1099: 
1100:           return (char) Integer.parseInt(tag.substring(1), 16);
1101: 
1102:         return (char) Integer.parseInt(tag);
1103:       }
1104: 
1105:     /* The error will be reported. */
1106:     catch (NumberFormatException nex)
1107:       {
1108:       }
1109:     catch (IndexOutOfBoundsException ix)
1110:       {
1111:       }
1112: 
1113:     error("Invalid numeric entity", a_tag);
1114:     return '?';
1115:   }
1116: 
1117:   /**
1118:    * Reset all fields into the intial default state, preparing the
1119:    * parset for parsing the next document.
1120:    */
1121:   protected void restart()
1122:   {
1123:     documentTags.clear();
1124:     titleHandled = false;
1125:     titleOpen = false;
1126:     buffer.setLength(0);
1127:     title.setLength(0);
1128:     validator.restart();
1129:   }
1130: 
1131:   /**
1132:    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1133:    * is found or if the parser concludes that the one should be present
1134:    * in the current position. The method is called immediately before
1135:    * calling the handleStartTag.
1136:    * @param The tag
1137:    */
1138:   protected void startTag(TagElement tag)
1139:                    throws ChangedCharSetException
1140:   {
1141:   }
1142: 
1143:   /**
1144:    * Handle a complete element, when the tag content is already present in the
1145:    * buffer and both starting and heading tags behind. This is called
1146:    * in the case when the tag text must not be parsed for the nested
1147:    * elements (elements STYLE and SCRIPT).
1148:    */
1149:   private void _handleCompleteElement(TagElement tag)
1150:   {
1151:     _handleStartTag(tag);
1152: 
1153:     // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1154:     HTML.Tag h = tag.getHTMLTag();
1155:     if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1156:       {
1157:         boolean tmp = titleOpen;
1158:         titleOpen = false;
1159:         _handleText();
1160:         titleOpen = tmp;
1161:       }
1162:     else
1163:       _handleText();
1164: 
1165:     _handleEndTag(tag);
1166:   }
1167: 
1168:   /**
1169:    * A hooks for operations, preceeding call to handleEmptyTag().
1170:    * Handle the tag with no content, like &lt;br&gt;. As no any
1171:    * nested tags are expected, the tag validator is not involved.
1172:    * @param The tag being handled.
1173:    */
1174:   private void _handleEmptyTag(TagElement tag)
1175:   {
1176:     try
1177:       {
1178:         validator.validateTag(tag, attributes);
1179:         handleEmptyTag(tag);
1180:       }
1181:     catch (ChangedCharSetException ex)
1182:       {
1183:         error("Changed charset exception:", ex.getMessage());
1184:       }
1185:   }
1186: 
1187:   /**
1188:    * A hooks for operations, preceeding call to handleEndTag().
1189:    * The method is called when the HTML closing tag
1190:    * is found. Calls handleTitle after closing the 'title' tag.
1191:    * @param The tag
1192:    */
1193:   private void _handleEndTag(TagElement tag)
1194:   {
1195:     validator.closeTag(tag);
1196:     _handleEndTag_remaining(tag);
1197:   }
1198: 
1199:   /**
1200:    * Actions that are also required if the closing action was
1201:    * initiated by the tag validator.
1202:    * Package-private to avoid an accessor method.
1203:    */
1204:   void _handleEndTag_remaining(TagElement tag)
1205:   {
1206:     HTML.Tag h = tag.getHTMLTag();
1207: 
1208:     handleEndTag(tag);
1209:     endTag(tag.fictional());
1210: 
1211:     if (h.isPreformatted())
1212:       preformatted--;
1213:     if (preformatted < 0)
1214:       preformatted = 0;
1215: 
1216:     if (h == HTML.Tag.TITLE)
1217:       {
1218:         titleOpen = false;
1219:         titleHandled = true;
1220: 
1221:         char[] a = new char[ title.length() ];
1222:         title.getChars(0, a.length, a, 0);
1223:         handleTitle(a);
1224:       }
1225:   }
1226: 
1227:   /**
1228:    * A hooks for operations, preceeding call to handleStartTag().
1229:    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1230:    * is found.
1231:    * Package-private to avoid an accessor method.
1232:    * @param The tag
1233:    */
1234:   void _handleStartTag(TagElement tag)
1235:   {
1236:     validator.openTag(tag, attributes);
1237:     startingTag(tag);
1238:     handleStartTag(tag);
1239: 
1240:     HTML.Tag h = tag.getHTMLTag();
1241: 
1242:     if (h.isPreformatted())
1243:       preformatted++;
1244: 
1245:     if (h == HTML.Tag.TITLE)
1246:       {
1247:         if (titleHandled)
1248:           error("Repetetive <TITLE> tag");
1249:         titleOpen = true;
1250:         titleHandled = false;
1251:       }
1252:   }
1253: 
1254:   /**
1255:    * Resume parsing after heavy errors in HTML tag structure.
1256:    * @throws ParseException
1257:    */
1258:   private void forciblyCloseTheTag()
1259:                             throws ParseException
1260:   {
1261:     int closeAt = 0;
1262:     buffer.setLength(0);
1263: 
1264:     ahead: 
1265:     for (int i = 1; i < 100; i++)
1266:       {
1267:         t = getTokenAhead(i - 1);
1268:         if (t.kind == EOF || t.kind == BEGIN)
1269:           break ahead;
1270:         if (t.kind == END)
1271:           {
1272:             /* Closing '>' found. */
1273:             closeAt = i;
1274:             break ahead;
1275:           }
1276:       }
1277:     if (closeAt > 0)
1278:       {
1279:         buffer.append("Ignoring '");
1280:         for (int i = 1; i <= closeAt; i++)
1281:           {
1282:             t = getNextToken();
1283:             append(t);
1284:           }
1285:         buffer.append('\'');
1286:         error(buffer.toString());
1287:       }
1288:   }
1289: 
1290:   /**
1291:    * Handle comment in string buffer. You can avoid allocating a char
1292:    * array each time by processing your comment directly here.
1293:    */
1294:   private void handleComment()
1295:   {
1296:     char[] a = new char[ buffer.length() ];
1297:     buffer.getChars(0, a.length, a, 0);
1298:     handleComment(a);
1299:   }
1300: 
1301:   private TagElement makeTagElement(String name, boolean isSupposed)
1302:   {
1303:     Element e = (Element) dtd.elementHash.get(name.toLowerCase());
1304:     if (e == null)
1305:       {
1306:         error("Unknown tag <" + name + ">");
1307:         e = dtd.getElement(name);
1308:         e.name = name.toUpperCase();
1309:         e.index = -1;
1310:       }
1311: 
1312:     if (!documentTags.contains(e.name))
1313:       {
1314:         markFirstTime(e);
1315:         documentTags.add(e.name);
1316:       }
1317: 
1318:     return makeTag(e, isSupposed);
1319:   }
1320: 
1321:   /**
1322:    * Read till the given token, resolving entities. Consume the given
1323:    * token without adding it to buffer.
1324:    * @param till The token to read till
1325:    * @throws ParseException
1326:    */
1327:   private void readTillTokenE(int till)
1328:                        throws ParseException
1329:   {
1330:     buffer.setLength(0);
1331:     read: 
1332:     while (true)
1333:       {
1334:         t = getNextToken();
1335:         if (t.kind == Constants.ENTITY)
1336:           {
1337:             resolveAndAppendEntity(t);
1338:           }
1339:         else if (t.kind == EOF)
1340:           {
1341:             error("unexpected eof", t);
1342:             break read;
1343:           }
1344:         else if (t.kind == till)
1345:           break read;
1346:         else if (t.kind == WS)
1347:           {
1348:             // Processing whitespace in accordance with CDATA rules:
1349:             String s = t.getImage();
1350:             char c;
1351:             for (int i = 0; i < s.length(); i++)
1352:               {
1353:                 c = s.charAt(i);
1354:                 if (c == '\r')
1355:                   buffer.append(' '); // CR replaced by space
1356:                 else if (c == '\n')
1357:                   ; // LF ignored
1358:                 else if (c == '\t')
1359:                   buffer.append(' '); // Tab replaced by space
1360:                 else
1361:                   buffer.append(c);
1362:               }
1363:           }
1364:         else
1365:           append(t);
1366:       }
1367:   }
1368: 
1369:   /**
1370:    * Resolve the entity and append it to the end of buffer.
1371:    * @param entity
1372:    */
1373:   private void resolveAndAppendEntity(Token entity)
1374:   {
1375:     switch (entity.category)
1376:       {
1377:         case ENTITY_NAMED :
1378:           buffer.append(resolveNamedEntity(entity.getImage()));
1379:           break;
1380: 
1381:         case ENTITY_NUMERIC :
1382:           buffer.append(resolveNumericEntity(entity.getImage()));
1383:           break;
1384: 
1385:         default :
1386:           throw new AssertionError("Invalid entity category " +
1387:                                    entity.category
1388:                                   );
1389:       }
1390:   }
1391: 
1392:   /**
1393:    * Handle the remaining of HTML tags. This is a common end for
1394:    * TAG, SCRIPT and STYLE.
1395:    * @param closing True for closing tags ( &lt;/TAG&gt; ).
1396:    * @param name Name of element
1397:    * @param start Token where element has started
1398:    * @throws ParseException
1399:    */
1400:   private void restOfTag(boolean closing, Token name, Token start)
1401:                   throws ParseException
1402:   {
1403:     boolean end = false;
1404:     Token next;
1405: 
1406:     optional(WS);
1407: 
1408:     readAttributes(name.getImage());
1409: 
1410:     optional(WS);
1411: 
1412:     next = getTokenAhead();
1413:     if (next.kind == END)
1414:       {
1415:         mustBe(END);
1416:         end = true;
1417:       }
1418: 
1419:     hTag = new Token(start, next);
1420: 
1421:     attributes.setResolveParent(defaulter.getDefaultParameters(name.getImage()));
1422: 
1423:     if (!end)
1424:       {
1425:         // The tag body contains errors. If additionally the tag
1426:         // name is not valid, this construction is treated as text.
1427:         if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1428:             backupMode
1429:            )
1430:           {
1431:             error("Errors in tag body and unknown tag name. " +
1432:                   "Treating the tag as a text."
1433:                  );
1434:             reset();
1435: 
1436:             hTag = mustBe(BEGIN);
1437:             buffer.setLength(0);
1438:             buffer.append(hTag.getImage());
1439:             CDATA(false);
1440:             return;
1441:           }
1442:         else
1443:           {
1444:             error("Forcibly closing invalid parameter list");
1445:             forciblyCloseTheTag();
1446:           }
1447:       }
1448: 
1449:     if (closing)
1450:       {
1451:         endTag(false);
1452:         _handleEndTag(makeTagElement(name.getImage(), false));
1453:       }
1454:     else
1455:       {
1456:         TagElement te = makeTagElement(name.getImage(), false);
1457:         if (te.getElement().type == DTDConstants.EMPTY)
1458:           _handleEmptyTag(te);
1459:         else
1460:           _handleStartTag(te);
1461:       }
1462:   }
1463: 
1464:   /**
1465:    * This should fire additional actions in response to the
1466:    * ChangedCharSetException.  The current implementation
1467:    * does nothing.
1468:    * @param tag
1469:    */
1470:   private void startingTag(TagElement tag)
1471:   {
1472:     try
1473:       {
1474:         startTag(tag);
1475:       }
1476:     catch (ChangedCharSetException cax)
1477:       {
1478:         error("Invalid change of charset");
1479:       }
1480:   }
1481: 
1482:   private void ws_error()
1483:   {
1484:     error("Whitespace here is not permitted");
1485:   }
1486: }