Source for gnu.xml.util.XMLWriter

   1: /* XMLWriter.java -- 
   2:    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.util;
  39: 
  40: import java.io.BufferedWriter;
  41: import java.io.CharConversionException;
  42: import java.io.IOException;
  43: import java.io.OutputStream;
  44: import java.io.OutputStreamWriter;
  45: import java.io.Writer;
  46: import java.util.Stack;
  47: 
  48: import org.xml.sax.*;
  49: import org.xml.sax.ext.*;
  50: import org.xml.sax.helpers.*;
  51: 
  52: 
  53: /**
  54:  * This class is a SAX handler which writes all its input as a well formed
  55:  * XML or XHTML document.  If driven using SAX2 events, this output may
  56:  * include a recreated document type declaration, subject to limitations
  57:  * of SAX (no internal subset exposed) or DOM (the important declarations,
  58:  * with their documentation, are discarded).
  59:  *
  60:  * <p> By default, text is generated "as-is", but some optional modes
  61:  * are supported.  Pretty-printing is supported, to make life easier
   62:  * for people reading the output.  XHTML (1.0) output can be made
  63:  * particularly pretty; all the built-in character entities are known.
  64:  * Canonical XML can also be generated, assuming the input is properly
  65:  * formed.
  66:  *
  67:  * <hr>
  68:  *
  69:  * <p> Some of the methods on this class are intended for applications to
  70:  * use directly, rather than as pure SAX2 event callbacks.  Some of those
  71:  * methods access the JavaBeans properties (used to tweak output formats,
  72:  * for example canonicalization and pretty printing).  Subclasses
  73:  * are expected to add new behaviors, not to modify current behavior, so
  74:  * many such methods are final.</p>
  75:  *
  76:  * <p> The <em>write*()</em> methods may be slightly simpler for some
  77:  * applications to use than direct callbacks.  For example, they support
  78:  * a simple policy for encoding data items as the content of a single element.
  79:  *
  80:  * <p> To reuse an XMLWriter you must provide it with a new Writer, since
  81:  * this handler closes the writer it was given as part of its endDocument()
  82:  * handling.  (XML documents have an end of input, and the way to encode
  83:  * that on a stream is to close it.) </p>
  84:  *
  85:  * <hr>
  86:  *
  87:  * <p> Note that any relative URIs in the source document, as found in
  88:  * entity and notation declarations, ought to have been fully resolved by
  89:  * the parser providing events to this handler.  This means that the
  90:  * output text should only have fully resolved URIs, which may not be
  91:  * the desired behavior in cases where later binding is desired. </p>
  92:  *
  93:  * <p> <em>Note that due to SAX2 defaults, you may need to manually
  94:  * ensure that the input events are XML-conformant with respect to namespace
  95:  * prefixes and declarations.  {@link gnu.xml.pipeline.NSFilter} is
  96:  * one solution to this problem, in the context of processing pipelines.</em>
  97:  * Something as simple as connecting this handler to a parser might not
  98:  * generate the correct output.  Another workaround is to ensure that the
  99:  * <em>namespace-prefixes</em> feature is always set to true, if you're
 100:  * hooking this directly up to some XMLReader implementation.
 101:  *
 102:  * @see gnu.xml.pipeline.TextConsumer
 103:  *
 104:  * @author David Brownell
 105:  *
 106:  * @deprecated Please use the javax.xml.stream APIs instead
 107:  */
 108: public class XMLWriter
 109:     implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler
 110: {
  111:     // text prints/escapes differently depending on context; these codes
  112:     // name the contexts (NOTE(review): their consumer is outside this excerpt)
  113:     //    CTX_ENTITY ... entity literal value; CTX_ATTRIBUTE ... attribute literal value
  114:     //    CTX_CONTENT ... content of an element
  115:     //    CTX_UNPARSED ... CDATA, comment, PI, names, etc
  116:     //    CTX_NAME ... name or nmtoken, no escapes possible
  117:     private static final int    CTX_ENTITY = 1;
  118:     private static final int    CTX_ATTRIBUTE = 2;
  119:     private static final int    CTX_CONTENT = 3;
  120:     private static final int    CTX_UNPARSED = 4;
  121:     private static final int    CTX_NAME = 5;
 122: 
 123: // FIXME: names (element, attribute, PI, notation, etc) are not
 124: // currently written out with range checks (escapeChars).
 125: // In non-XHTML, some names can't be directly written; panic!
 126: 
 127:     private static String    sysEOL;
 128: 
 129:     static {
 130:     try {
 131:         sysEOL = System.getProperty ("line.separator", "\n");
 132: 
 133:         // don't use the system's EOL if it's illegal XML.
 134:         if (!isLineEnd (sysEOL))
 135:         sysEOL = "\n";
 136: 
 137:     } catch (SecurityException e) {
 138:         sysEOL = "\n";
 139:     }
 140:     }
 141: 
 142:     private static boolean isLineEnd (String eol)
 143:     {
 144:     return "\n".equals (eol)
 145:             || "\r".equals (eol)
 146:             || "\r\n".equals (eol);
 147:     }
 148: 
  149:     private Writer        out;	// output sink; null when no writer is attached (endDocument clears it)
  150:     private boolean        inCDATA;
  151:     private int            elementNestLevel;
  152:     private String        eol = sysEOL;	// line terminator in use; see setEOL and setCanonical
  153: 
  154:     private short        dangerMask;	// 0xff80 for US-ASCII, 0xff00 for ISO-8859-1, else 0 (see setEncoding)
  155:     private StringBuffer    stringBuf;	// allocated by setEncoding when dangerMask != 0
  156:     private Locator        locator;	// non-null also flags that events have started (see startDocument)
  157:     private ErrorHandler    errHandler;	// optional; fatal() notifies it before throwing
  158: 
  159:     private boolean        expandingEntities = false;	// see setExpandingEntities
  160:     private int            entityNestLevel;	// reset to 0 by startDocument
  161:     private boolean        xhtml;	// see setXhtml
  162:     private boolean        startedDoctype;	// set by startDocument in XHTML mode
  163:     private String        encoding;	// name written in the XML declaration; may be null
  164: 
  165:     private boolean        canonical;	// see setCanonical
  166:     private boolean        inDoctype;
  167:     private boolean        inEpilogue;
  168: 
  169:     // pretty printing controls
  170:     private boolean        prettyPrinting;	// see setPrettyPrinting
  171:     private int            column;
  172:     private boolean        noWrap;
  173:     private Stack        space = new Stack ();	// setWriter pushes "default"; presumably xml:space values -- TODO confirm
 174: 
 175:     // this is not a hard'n'fast rule -- longer lines are OK,
 176:     // but are to be avoided.  Here, prettyprinting is more to
 177:     // show structure "cleanly" than to be precise about it.
 178:     // better to have ragged layout than one line 24Kb long.
 179:     private static final int    lineLength = 75;
 180: 
 181: 
 182:     /**
 183:      * Constructs this handler with System.out used to write SAX events
 184:      * using the UTF-8 encoding.  Avoid using this except when you know
 185:      * it's safe to close System.out at the end of the document.
 186:      */
 187:     public XMLWriter () throws IOException
 188:     { this (System.out); }
 189: 
 190:     /**
 191:      * Constructs a handler which writes all input to the output stream
 192:      * in the UTF-8 encoding, and closes it when endDocument is called.
 193:      * (Yes it's annoying that this throws an exception -- but there's
 194:      * really no way around it, since it's barely possible a JDK may
 195:      * exist somewhere that doesn't know how to emit UTF-8.)
 196:      */
 197:     public XMLWriter (OutputStream out) throws IOException
 198:     {
 199:     this (new OutputStreamWriter (out, "UTF8"));
 200:     }
 201: 
 202:     /**
 203:      * Constructs a handler which writes all input to the writer, and then
 204:      * closes the writer when the document ends.  If an XML declaration is
 205:      * written onto the output, and this class can determine the name of
 206:      * the character encoding for this writer, that encoding name will be
 207:      * included in the XML declaration.
 208:      *
 209:      * <P> See the description of the constructor which takes an encoding
 210:      * name for imporant information about selection of encodings.
 211:      *
 212:      * @param writer XML text is written to this writer.
 213:      */
 214:     public XMLWriter (Writer writer)
 215:     {
 216:     this (writer, null);
 217:     }
 218: 
 219:     /**
 220:      * Constructs a handler which writes all input to the writer, and then
 221:      * closes the writer when the document ends.  If an XML declaration is
 222:      * written onto the output, this class will use the specified encoding
 223:      * name in that declaration.  If no encoding name is specified, no
 224:      * encoding name will be declared unless this class can otherwise
 225:      * determine the name of the character encoding for this writer.
 226:      *
 227:      * <P> At this time, only the UTF-8 ("UTF8") and UTF-16 ("Unicode")
 228:      * output encodings are fully lossless with respect to XML data.  If you
 229:      * use any other encoding you risk having your data be silently mangled
 230:      * on output, as the standard Java character encoding subsystem silently
 231:      * maps non-encodable characters to a question mark ("?") and will not
 232:      * report such errors to applications.
 233:      *
 234:      * <p> For a few other encodings the risk can be reduced. If the writer is
 235:      * a java.io.OutputStreamWriter, and uses either the ISO-8859-1 ("8859_1",
 236:      * "ISO8859_1", etc) or US-ASCII ("ASCII") encodings, content which
 237:      * can't be encoded in those encodings will be written safely.  Where
 238:      * relevant, the XHTML entity names will be used; otherwise, numeric
 239:      * character references will be emitted.
 240:      *
 241:      * <P> However, there remain a number of cases where substituting such
 242:      * entity or character references is not an option.  Such references are
 243:      * not usable within a DTD, comment, PI, or CDATA section.  Neither may
 244:      * they be used when element, attribute, entity, or notation names have
 245:      * the problematic characters.
 246:      *
 247:      * @param writer XML text is written to this writer.
 248:      * @param encoding if non-null, and an XML declaration is written,
 249:      *    this is the name that will be used for the character encoding.
 250:      */
 251:     public XMLWriter (Writer writer, String encoding)
 252:     {
 253:     setWriter (writer, encoding);
 254:     }
 255:     
 256:     private void setEncoding (String encoding)
 257:     {
 258:     if (encoding == null && out instanceof OutputStreamWriter)
 259:         encoding = ((OutputStreamWriter)out).getEncoding ();
 260: 
 261:     if (encoding != null) {
 262:         encoding = encoding.toUpperCase ();
 263: 
 264:         // Use official encoding names where we know them,
 265:         // avoiding the Java-only names.  When using common
 266:         // encodings where we can easily tell if characters
 267:         // are out of range, we'll escape out-of-range
 268:         // characters using character refs for safety.
 269: 
 270:         // I _think_ these are all the main synonyms for these!
 271:         if ("UTF8".equals (encoding)) {
 272:         encoding = "UTF-8";
 273:         } else if ("US-ASCII".equals (encoding)
 274:             || "ASCII".equals (encoding)) {
 275:         dangerMask = (short) 0xff80;
 276:         encoding = "US-ASCII";
 277:         } else if ("ISO-8859-1".equals (encoding)
 278:             || "8859_1".equals (encoding)
 279:             || "ISO8859_1".equals (encoding)) {
 280:         dangerMask = (short) 0xff00;
 281:         encoding = "ISO-8859-1";
 282:         } else if ("UNICODE".equals (encoding)
 283:             || "UNICODE-BIG".equals (encoding)
 284:             || "UNICODE-LITTLE".equals (encoding)) {
 285:         encoding = "UTF-16";
 286: 
 287:         // TODO: UTF-16BE, UTF-16LE ... no BOM; what
 288:         // release of JDK supports those Unicode names?
 289:         }
 290: 
 291:         if (dangerMask != 0)
 292:         stringBuf = new StringBuffer ();
 293:     }
 294: 
 295:     this.encoding = encoding;
 296:     }
 297: 
 298: 
 299:     /**
 300:      * Resets the handler to write a new text document.
 301:      *
 302:      * @param writer XML text is written to this writer.
 303:      * @param encoding if non-null, and an XML declaration is written,
 304:      *    this is the name that will be used for the character encoding.
 305:      *
 306:      * @exception IllegalStateException if the current
 307:      *    document hasn't yet ended (with {@link #endDocument})
 308:      */
 309:     final public void setWriter (Writer writer, String encoding)
 310:     {
 311:     if (out != null)
 312:         throw new IllegalStateException (
 313:         "can't change stream in mid course");
 314:     out = writer;
 315:     if (out != null)
 316:         setEncoding (encoding);
 317:     if (!(out instanceof BufferedWriter))
 318:         out = new BufferedWriter (out);
 319:     space.push ("default");
 320:     }
 321: 
 322:     /**
 323:      * Assigns the line ending style to be used on output.
 324:      * @param eolString null to use the system default; else
 325:      *    "\n", "\r", or "\r\n".
 326:      */
 327:     final public void setEOL (String eolString)
 328:     {
 329:     if (eolString == null)
 330:         eol = sysEOL;
 331:     else if (!isLineEnd (eolString))
 332:         eol = eolString;
 333:     else
 334:         throw new IllegalArgumentException (eolString);
 335:     }
 336: 
 337:     /**
 338:      * Assigns the error handler to be used to present most fatal
 339:      * errors.
 340:      */
 341:     public void setErrorHandler (ErrorHandler handler)
 342:     {
 343:     errHandler = handler;
 344:     }
 345: 
 346:     /**
 347:      * Used internally and by subclasses, this encapsulates the logic
 348:      * involved in reporting fatal errors.  It uses locator information
 349:      * for good diagnostics, if available, and gives the application's
 350:      * ErrorHandler the opportunity to handle the error before throwing
 351:      * an exception.
 352:      */
 353:     protected void fatal (String message, Exception e)
 354:     throws SAXException
 355:     {
 356:     SAXParseException    x;
 357: 
 358:     if (locator == null)
 359:         x = new SAXParseException (message, null, null, -1, -1, e);
 360:     else
 361:         x = new SAXParseException (message, locator, e);
 362:     if (errHandler != null)
 363:         errHandler.fatalError (x);
 364:     throw x;
 365:     }
 366: 
 367: 
 368:     // JavaBeans properties
 369: 
 370:     /**
 371:      * Controls whether the output should attempt to follow the "transitional"
 372:      * XHTML rules so that it meets the "HTML Compatibility Guidelines"
 373:      * appendix in the XHTML specification.  A "transitional" Document Type
 374:      * Declaration (DTD) is placed near the beginning of the output document,
 375:      * instead of whatever DTD would otherwise have been placed there, and
 376:      * XHTML empty elements are printed specially.  When writing text in
 377:      * US-ASCII or ISO-8859-1 encodings, the predefined XHTML internal
 378:      * entity names are used (in preference to character references) when
 379:      * writing content characters which can't be expressed in those encodings.
 380:      *
 381:      * <p> When this option is enabled, it is the caller's responsibility
 382:      * to ensure that the input is otherwise valid as XHTML.  Things to
 383:      * be careful of in all cases, as described in the appendix referenced
 384:      * above, include:  <ul>
 385:      *
 386:      *    <li> Element and attribute names must be in lower case, both
 387:      *        in the document and in any CSS style sheet.
 388:      *    <li> All XML constructs must be valid as defined by the XHTML
 389:      *        "transitional" DTD (including all familiar constructs,
 390:      *        even deprecated ones).
 391:      *    <li> The root element must be "html".
  392:      *    <li> Elements that must be empty (such as <em>&lt;br&gt;</em>)
 393:      *        must have no content.
 394:      *    <li> Use both <em>lang</em> and <em>xml:lang</em> attributes
 395:      *        when specifying language.
 396:      *    <li> Similarly, use both <em>id</em> and <em>name</em> attributes
 397:      *        when defining elements that may be referred to through
 398:      *        URI fragment identifiers ... and make sure that the
 399:      *        value is a legal NMTOKEN, since not all such HTML 4.0
 400:      *        identifiers are valid in XML.
 401:      *    <li> Be careful with character encodings; make sure you provide
 402:      *        a <em>&lt;meta http-equiv="Content-type"
 403:      *        content="text/xml;charset=..." /&gt;</em> element in
 404:      *        the HTML "head" element, naming the same encoding
 405:      *        used to create this handler.  Also, if that encoding
 406:      *        is anything other than US-ASCII, make sure that if
 407:      *        the document is given a MIME content type, it has
 408:      *        a <em>charset=...</em> attribute with that encoding.
 409:      *    </ul>
 410:      *
 411:      * <p> Additionally, some of the oldest browsers have additional
 412:      * quirks, to address with guidelines such as: <ul>
 413:      *
 414:      *    <li> Processing instructions may be rendered, so avoid them.
 415:      *        (Similarly for an XML declaration.)
 416:      *    <li> Embedded style sheets and scripts should not contain XML
 417:      *        markup delimiters:  &amp;, &lt;, and ]]&gt; are trouble.
 418:      *    <li> Attribute values should not have line breaks or multiple
 419:      *        consecutive white space characters.
 420:      *    <li> Use no more than one of the deprecated (transitional)
 421:      *        <em>&lt;isindex&gt;</em> elements.
 422:      *    <li> Some boolean attributes (such as <em>compact, checked,
 423:      *        disabled, readonly, selected,</em> and more) confuse
 424:      *        some browsers, since they only understand minimized
 425:      *        versions which are illegal in XML.
 426:      *    </ul>
 427:      *
 428:      * <p> Also, some characteristics of the resulting output may be
 429:      * a function of whether the document is later given a MIME
 430:      * content type of <em>text/html</em> rather than one indicating
 431:      * XML (<em>application/xml</em> or <em>text/xml</em>).  Worse,
  432:      * some browsers ignore MIME content types and prefer to rely on URI
 433:      * name suffixes -- so an "index.xml" could always be XML, never
 434:      * XHTML, no matter its MIME type.
 435:      */
 436:     final public void setXhtml (boolean value)
 437:     {
 438:     if (locator != null)
 439:         throw new IllegalStateException ("started parsing");
 440:     xhtml = value;
 441:     if (xhtml)
 442:         canonical = false;
 443:     }
 444: 
 445:     /**
 446:      * Returns true if the output attempts to echo the input following
 447:      * "transitional" XHTML rules and matching the "HTML Compatibility
 448:      * Guidelines" so that an HTML version 3 browser can read the output
 449:      * as HTML; returns false (the default) othewise.
 450:      */
 451:     final public boolean isXhtml ()
 452:     {
 453:     return xhtml;
 454:     }
 455: 
 456:     /**
 457:      * Controls whether the output text contains references to
 458:      * entities (the default), or instead contains the expanded
 459:      * values of those entities.
 460:      */
 461:     final public void setExpandingEntities (boolean value)
 462:     {
 463:     if (locator != null)
 464:         throw new IllegalStateException ("started parsing");
 465:     expandingEntities = value;
 466:     if (!expandingEntities)
 467:         canonical = false;
 468:     }
 469: 
 470:     /**
 471:      * Returns true if the output will have no entity references;
 472:      * returns false (the default) otherwise.
 473:      */
 474:     final public boolean isExpandingEntities ()
 475:     {
 476:     return expandingEntities;
 477:     }
 478: 
 479:     /**
 480:      * Controls pretty-printing, which by default is not enabled
 481:      * (and currently is most useful for XHTML output).
 482:      * Pretty printing enables structural indentation, sorting of attributes
 483:      * by name, line wrapping, and potentially other mechanisms for making
 484:      * output more or less readable.
 485:      *
 486:      * <p> At this writing, structural indentation and line wrapping are
 487:      * enabled when pretty printing is enabled and the <em>xml:space</em>
 488:      * attribute has the value <em>default</em> (its other legal value is
 489:      * <em>preserve</em>, as defined in the XML specification).  The three
 490:      * XHTML element types which use another value are recognized by their
 491:      * names (namespaces are ignored).
 492:      *
 493:      * <p> Also, for the record, the "pretty" aspect of printing here
 494:      * is more to provide basic structure on outputs that would otherwise
 495:      * risk being a single long line of text.  For now, expect the
 496:      * structure to be ragged ... unless you'd like to submit a patch
 497:      * to make this be more strictly formatted!
 498:      *
 499:      * @exception IllegalStateException thrown if this method is invoked
 500:      *    after output has begun.
 501:      */
 502:     final public void setPrettyPrinting (boolean value)
 503:     {
 504:     if (locator != null)
 505:         throw new IllegalStateException ("started parsing");
 506:     prettyPrinting = value;
 507:     if (prettyPrinting)
 508:         canonical = false;
 509:     }
 510: 
 511:     /**
 512:      * Returns value of flag controlling pretty printing.
 513:      */
 514:     final public boolean isPrettyPrinting ()
 515:     {
 516:     return prettyPrinting;
 517:     }
 518: 
 519: 
 520:     /**
 521:      * Sets the output style to be canonicalized.  Input events must
 522:      * meet requirements that are slightly more stringent than the
 523:      * basic well-formedness ones, and include:  <ul>
 524:      *
 525:      *    <li> Namespace prefixes must not have been changed from those
 526:      *    in the original document.  (This may only be ensured by setting
 527:      *    the SAX2 XMLReader <em>namespace-prefixes</em> feature flag;
 528:      *    by default, it is cleared.)
 529:      *
 530:      *    <li> Redundant namespace declaration attributes have been
 531:      *    removed.  (If an ancestor element defines a namespace prefix
  532:      *    and that declaration hasn't been overridden, an element must
 533:      *    not redeclare it.)
 534:      *
 535:      *    <li> If comments are not to be included in the canonical output,
  536:      *    they must first be removed from the input event stream; this is
 537:      *    <em>Canonical XML with comments</em> by default.
 538:      *
 539:      *    <li> If the input character encoding was not UCS-based, the
 540:      *    character data must have been normalized using Unicode
 541:      *    Normalization Form C.  (UTF-8 and UTF-16 are UCS-based.)
 542:      *
 543:      *    <li> Attribute values must have been normalized, as is done
 544:      *    by any conformant XML processor which processes all external
 545:      *    parameter entities.
 546:      *
 547:      *    <li> Similarly, attribute value defaulting has been performed.
 548:      *
 549:      *    </ul>
 550:      *
 551:      * <p> Note that fragments of XML documents, as specified by an XPath
 552:      * node set, may be canonicalized.  In such cases, elements may need
 553:      * some fixup (for <em>xml:*</em> attributes and application-specific
 554:      * context).
 555:      *
 556:      * @exception IllegalArgumentException if the output encoding
 557:      *    is anything other than UTF-8.
 558:      */
 559:     final public void setCanonical (boolean value)
 560:     {
 561:     if (value && !"UTF-8".equals (encoding))
 562:         throw new IllegalArgumentException ("encoding != UTF-8");
 563:     canonical = value;
 564:     if (canonical) {
 565:         prettyPrinting = xhtml = false;
 566:         expandingEntities = true;
 567:         eol = "\n";
 568:     }
 569:     }
 570: 
 571: 
 572:     /**
 573:      * Returns value of flag controlling canonical output.
 574:      */
 575:     final public boolean isCanonical ()
 576:     {
 577:     return canonical;
 578:     }
 579: 
 580: 
 581:     /**
 582:      * Flushes the output stream.  When this handler is used in long lived
 583:      * pipelines, it can be important to flush buffered state, for example
 584:      * so that it can reach the disk as part of a state checkpoint.
 585:      */
 586:     final public void flush ()
 587:     throws IOException
 588:     {
 589:     if (out != null)
 590:         out.flush ();
 591:     }
 592: 
 593: 
 594:     // convenience routines
 595: 
 596: // FIXME:  probably want a subclass that holds a lot of these...
 597: // and maybe more!
 598:     
 599:     /**
 600:      * Writes the string as if characters() had been called on the contents
 601:      * of the string.  This is particularly useful when applications act as
 602:      * producers and write data directly to event consumers.
 603:      */
 604:     final public void write (String data)
 605:     throws SAXException
 606:     {
 607:     char    buf [] = data.toCharArray ();
 608:     characters (buf, 0, buf.length);
 609:     }
 610: 
 611: 
 612:     /**
 613:      * Writes an element that has content consisting of a single string.
 614:      * @see #writeEmptyElement
 615:      * @see #startElement
 616:      */
 617:     public void writeElement (
 618:     String uri,
 619:     String localName,
 620:     String qName,
 621:     Attributes atts,
 622:     String content
 623:     ) throws SAXException
 624:     {
 625:     if (content == null || content.length () == 0) {
 626:         writeEmptyElement (uri, localName, qName, atts);
 627:         return;
 628:     }
 629:     startElement (uri, localName, qName, atts);
 630:     char chars [] = content.toCharArray ();
 631:     characters (chars, 0, chars.length);
 632:     endElement (uri, localName, qName);
 633:     }
 634: 
 635: 
 636:     /**
 637:      * Writes an element that has content consisting of a single integer,
 638:      * encoded as a decimal string.
 639:      * @see #writeEmptyElement
 640:      * @see #startElement
 641:      */
 642:     public void writeElement (
 643:     String uri,
 644:     String localName,
 645:     String qName,
 646:     Attributes atts,
 647:     int content
 648:     ) throws SAXException
 649:     {
 650:     writeElement (uri, localName, qName, atts, Integer.toString (content));
 651:     }
 652: 
 653: 
 654:     // SAX1 ContentHandler
 655:     /** <b>SAX1</b>:  provides parser status information */
 656:     final public void setDocumentLocator (Locator l)
 657:     {
 658:     locator = l;
 659:     }
 660: 
 661: 
 662:     // URL for dtd that validates against all normal HTML constructs
 663:     private static final String xhtmlFullDTD =
 664:     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
 665: 
 666:     
 667:     /**
 668:      * <b>SAX1</b>:  indicates the beginning of a document parse.
 669:      * If you're writing (well formed) fragments of XML, neither
 670:      * this nor endDocument should be called.
 671:      */
 672:     // NOT final
 673:     public void startDocument ()
 674:     throws SAXException
 675:     {
 676:     try {
 677:         if (out == null)
 678:         throw new IllegalStateException (
 679:             "null Writer given to XMLWriter");
 680: 
 681:         // Not all parsers provide the locator we want; this also
 682:         // flags whether events are being sent to this object yet.
 683:         // We could only have this one call if we only printed whole
 684:         // documents ... but we also print fragments, so most of the
 685:         // callbacks here replicate this test.
 686: 
 687:         if (locator == null)
 688:         locator = new LocatorImpl ();
 689:         
 690:         // Unless the data is in US-ASCII or we're canonicalizing, write
 691:         // the XML declaration if we know the encoding.  US-ASCII won't
 692:         // normally get mangled by web server confusion about the
 693:         // character encodings used.  Plus, it's an easy way to
 694:         // ensure we can write ASCII that's unlikely to confuse
 695:         // elderly HTML parsers.
 696: 
 697:         if (!canonical
 698:             && dangerMask != (short) 0xff80
 699:             && encoding != null) {
 700:         rawWrite ("<?xml version='1.0'");
 701:         rawWrite (" encoding='" + encoding + "'");
 702:         rawWrite ("?>");
 703:         newline ();
 704:         }
 705: 
 706:         if (xhtml) {
 707: 
 708:         rawWrite ("<!DOCTYPE html PUBLIC");
 709:         newline ();
 710:         rawWrite ("  '-//W3C//DTD XHTML 1.0 Transitional//EN'");
 711:         newline ();
 712:         rawWrite ("  '");
 713:             // NOTE:  URL (above) matches the REC
 714:         rawWrite (xhtmlFullDTD);
 715:         rawWrite ("'>");
 716:         newline ();
 717:         newline ();
 718: 
 719:         // fake the rest of the handler into ignoring
 720:         // everything until the root element, so any
 721:         // XHTML DTD comments, PIs, etc are ignored
 722:         startedDoctype = true;
 723:         }
 724: 
 725:         entityNestLevel = 0;
 726: 
 727:     } catch (IOException e) {
 728:         fatal ("can't write", e);
 729:     }
 730:     }
 731: 
 732:     /**
 733:      * <b>SAX1</b>:  indicates the completion of a parse.
 734:      * Note that all complete SAX event streams make this call, even
 735:      * if an error is reported during a parse.
 736:      */
 737:     // NOT final
 738:     public void endDocument ()
 739:     throws SAXException
 740:     {
 741:     try {
 742:         if (!canonical) {
 743:         newline ();
 744:         newline ();
 745:         }
 746:         out.close ();
 747:         out = null;
 748:         locator = null;
 749:     } catch (IOException e) {
 750:         fatal ("can't write", e);
 751:     }
 752:     }
 753: 
 754:     // XHTML elements declared as EMPTY print differently
 755:     final private static boolean isEmptyElementTag (String tag)
 756:     {
 757:     switch (tag.charAt (0)) {
 758:       case 'a':    return "area".equals (tag);
 759:       case 'b':    return "base".equals (tag)
 760:                 || "basefont".equals (tag)
 761:                 || "br".equals (tag);
 762:       case 'c':    return "col".equals (tag);
 763:       case 'f':    return "frame".equals (tag);
 764:       case 'h':    return "hr".equals (tag);
 765:       case 'i':    return "img".equals (tag)
 766:                 || "input".equals (tag)
 767:                 || "isindex".equals (tag);
 768:       case 'l':    return "link".equals (tag);
 769:       case 'm':    return "meta".equals (tag);
 770:       case 'p':    return "param".equals (tag);
 771:     }
 772:     return false;
 773:     }
 774: 
 775:     private static boolean indentBefore (String tag)
 776:     {
 777:     // basically indent before block content
 778:     // and within structure like tables, lists
 779:     switch (tag.charAt (0)) {
 780:       case 'a':    return "applet".equals (tag);
 781:       case 'b':    return "body".equals (tag)
 782:                 || "blockquote".equals (tag);
 783:       case 'c':    return "center".equals (tag);
 784:       case 'f':    return "frame".equals (tag)
 785:                 || "frameset".equals (tag);
 786:       case 'h':    return "head".equals (tag);
 787:       case 'm':    return "meta".equals (tag);
 788:       case 'o':    return "object".equals (tag);
 789:       case 'p':    return "param".equals (tag)
 790:                 || "pre".equals (tag);
 791:       case 's':    return "style".equals (tag);
 792:       case 't':    return "title".equals (tag)
 793:                 || "td".equals (tag)
 794:                 || "th".equals (tag);
 795:     }
 796:     // ... but not inline elements like "em", "b", "font"
 797:     return false;
 798:     }
 799: 
 800:     private static boolean spaceBefore (String tag)
 801:     {
 802:     // blank line AND INDENT before certain structural content
 803:     switch (tag.charAt (0)) {
 804:       case 'h':    return "h1".equals (tag)
 805:                 || "h2".equals (tag)
 806:                 || "h3".equals (tag)
 807:                 || "h4".equals (tag)
 808:                 || "h5".equals (tag)
 809:                 || "h6".equals (tag)
 810:                 || "hr".equals (tag);
 811:       case 'l':    return "li".equals (tag);
 812:       case 'o':    return "ol".equals (tag);
 813:       case 'p':    return "p".equals (tag);
 814:       case 't':    return "table".equals (tag)
 815:                 || "tr".equals (tag);
 816:       case 'u':    return "ul".equals (tag);
 817:     }
 818:     return false;
 819:     }
 820: 
 821:     // XHTML DTDs say these three have xml:space="preserve"
 822:     private static boolean spacePreserve (String tag)
 823:     {
 824:     return "pre".equals (tag)
 825:         || "style".equals (tag)
 826:         || "script".equals (tag);
 827:     }
 828: 
    /**
     * <b>SAX2</b>:  ignored.
     * The start of a prefix mapping produces no output and changes
     * no state here; the method body is empty.
     *
     * @param prefix the prefix being mapped (unused)
     * @param uri the namespace URI it is mapped to (unused)
     */
    final public void startPrefixMapping (String prefix, String uri)
    {
        // intentionally empty
    }
 834: 
    /**
     * <b>SAX2</b>:  ignored.
     * The end of a prefix mapping produces no output and changes
     * no state here; the method body is empty.
     *
     * @param prefix the prefix whose mapping ended (unused)
     */
    final public void endPrefixMapping (String prefix)
    {
        // intentionally empty
    }
 840: 
 841:     private void writeStartTag (
 842:     String name,
 843:     Attributes atts,
 844:     boolean isEmpty
 845:     ) throws SAXException, IOException
 846:     {
 847:     rawWrite ('<');
 848:     rawWrite (name);
 849: 
 850:     // write out attributes ... sorting is particularly useful
 851:     // with output that's been heavily defaulted.
 852:     if (atts != null && atts.getLength () != 0) {
 853: 
 854:         // Set up to write, with optional sorting
 855:         int     indices [] = new int [atts.getLength ()];
 856: 
 857:         for (int i= 0; i < indices.length; i++)
 858:         indices [i] = i;
 859:         
 860:         // optionally sort
 861: 
 862: // FIXME:  canon xml demands xmlns nodes go first,
 863: // and sorting by URI first (empty first) then localname
 864: // it should maybe use a different sort
 865: 
 866:         if (canonical || prettyPrinting) {
 867: 
 868:         // insertion sort by attribute name
 869:         for (int i = 1; i < indices.length; i++) {
 870:             int    n = indices [i], j;
 871:             String    s = atts.getQName (n);
 872: 
 873:             for (j = i - 1; j >= 0; j--) {
 874:             if (s.compareTo (atts.getQName (indices [j]))
 875:                 >= 0)
 876:                 break;
 877:             indices [j + 1] = indices [j];
 878:             }
 879:             indices [j + 1] = n;
 880:         }
 881:         }
 882: 
 883:         // write, sorted or no
 884:         for (int i= 0; i < indices.length; i++) {
 885:         String    s = atts.getQName (indices [i]);
 886: 
 887:             if (s == null || "".equals (s))
 888:             throw new IllegalArgumentException ("no XML name");
 889:         rawWrite (" ");
 890:         rawWrite (s);
 891:         rawWrite ("=");
 892:         writeQuotedValue (atts.getValue (indices [i]),
 893:             CTX_ATTRIBUTE);
 894:         }
 895:     }
 896:     if (isEmpty)
 897:         rawWrite (" /");
 898:     rawWrite ('>');
 899:     }
 900: 
 901:     /**
 902:      * <b>SAX2</b>:  indicates the start of an element.
 903:      * When XHTML is in use, avoid attribute values with
 904:      * line breaks or multiple whitespace characters, since
 905:      * not all user agents handle them correctly.
 906:      */
 907:     final public void startElement (
 908:     String uri,
 909:     String localName,
 910:     String qName,
 911:     Attributes atts
 912:     ) throws SAXException
 913:     {
 914:     startedDoctype = false;
 915: 
 916:     if (locator == null)
 917:         locator = new LocatorImpl ();
 918:         
 919:     if (qName == null || "".equals (qName))
 920:         throw new IllegalArgumentException ("no XML name");
 921: 
 922:     try {
 923:         if (entityNestLevel != 0)
 924:         return;
 925:         if (prettyPrinting) {
 926:         String whitespace = null;
 927: 
 928:         if (xhtml && spacePreserve (qName))
 929:             whitespace = "preserve";
 930:         else if (atts != null)
 931:             whitespace = atts.getValue ("xml:space");
 932:         if (whitespace == null)
 933:             whitespace = (String) space.peek ();
 934:         space.push (whitespace);
 935: 
 936:         if ("default".equals (whitespace)) {
 937:             if (xhtml) {
 938:             if (spaceBefore (qName)) {
 939:                 newline ();
 940:                 doIndent ();
 941:             } else if (indentBefore (qName))
 942:                 doIndent ();
 943:             // else it's inlined, modulo line length
 944:             // FIXME: incrementing element nest level
 945:             // for inlined elements causes ugliness
 946:             } else
 947:             doIndent ();
 948:         }
 949:         }
 950:         elementNestLevel++;
 951:         writeStartTag (qName, atts, xhtml && isEmptyElementTag (qName));
 952: 
 953:         if (xhtml) {
 954: // FIXME: if this is an XHTML "pre" element, turn
 955: // off automatic wrapping.
 956:         }
 957: 
 958:     } catch (IOException e) {
 959:         fatal ("can't write", e);
 960:     }
 961:     }
 962: 
 963:     /**
 964:      * Writes an empty element.
 965:      * @see #startElement
 966:      */
 967:     public void writeEmptyElement (
 968:     String uri,
 969:     String localName,
 970:     String qName,
 971:     Attributes atts
 972:     ) throws SAXException
 973:     {
 974:     if (canonical) {
 975:         startElement (uri, localName, qName, atts);
 976:         endElement (uri, localName, qName);
 977:     } else {
 978:         try {
 979:         writeStartTag (qName, atts, true);
 980:         } catch (IOException e) {
 981:         fatal ("can't write", e);
 982:         }
 983:     }
 984:     }
 985: 
 986: 
 987:     /** <b>SAX2</b>:  indicates the end of an element */
 988:     final public void endElement (String uri, String localName, String qName)
 989:     throws SAXException
 990:     {
 991:     if (qName == null || "".equals (qName))
 992:         throw new IllegalArgumentException ("no XML name");
 993: 
 994:     try {
 995:         elementNestLevel--;
 996:         if (entityNestLevel != 0)
 997:         return;
 998:         if (xhtml && isEmptyElementTag (qName))
 999:         return;
1000:         rawWrite ("</");
1001:         rawWrite (qName);
1002:         rawWrite ('>');
1003: 
1004:         if (prettyPrinting) {
1005:         if (!space.empty ())
1006:             space.pop ();
1007:         else
1008:             fatal ("stack discipline", null);
1009:         }
1010:         if (elementNestLevel == 0)
1011:         inEpilogue = true;
1012: 
1013:     } catch (IOException e) {
1014:         fatal ("can't write", e);
1015:     }
1016:     }
1017: 
1018:     /** <b>SAX1</b>:  reports content characters */
1019:     final public void characters (char ch [], int start, int length)
1020:     throws SAXException
1021:     {
1022:     if (locator == null)
1023:         locator = new LocatorImpl ();
1024: 
1025:     try {
1026:         if (entityNestLevel != 0)
1027:         return;
1028:         if (inCDATA) {
1029:         escapeChars (ch, start, length, CTX_UNPARSED);
1030:         } else {
1031:         escapeChars (ch, start, length, CTX_CONTENT);
1032:         }
1033:     } catch (IOException e) {
1034:         fatal ("can't write", e);
1035:     }
1036:     }
1037: 
1038:     /** <b>SAX1</b>:  reports ignorable whitespace */
1039:     final public void ignorableWhitespace (char ch [], int start, int length)
1040:     throws SAXException
1041:     {
1042:     if (locator == null)
1043:         locator = new LocatorImpl ();
1044: 
1045:     try {
1046:         if (entityNestLevel != 0)
1047:         return;
1048:         // don't forget to map NL to CRLF, CR, etc
1049:         escapeChars (ch, start, length, CTX_CONTENT);
1050:     } catch (IOException e) {
1051:         fatal ("can't write", e);
1052:     }
1053:     }
1054: 
1055:     /**
1056:      * <b>SAX1</b>:  reports a PI.
1057:      * This doesn't check for illegal target names, such as "xml" or "XML",
1058:      * or namespace-incompatible ones like "big:dog"; the caller is
1059:      * responsible for ensuring those names are legal.
1060:      */
1061:     final public void processingInstruction (String target, String data)
1062:     throws SAXException
1063:     {
1064:     if (locator == null)
1065:         locator = new LocatorImpl ();
1066: 
1067:     // don't print internal subset for XHTML
1068:     if (xhtml && startedDoctype)
1069:         return;
1070: 
1071:     // ancient HTML browsers might render these ... their loss.
1072:     // to prevent:  "if (xhtml) return;".
1073: 
1074:     try {
1075:         if (entityNestLevel != 0)
1076:         return;
1077:         if (canonical && inEpilogue)
1078:         newline ();
1079:         rawWrite ("<?");
1080:         rawWrite (target);
1081:         rawWrite (' ');
1082:         escapeChars (data.toCharArray (), -1, -1, CTX_UNPARSED);
1083:         rawWrite ("?>");
1084:         if (elementNestLevel == 0 && !(canonical && inEpilogue))
1085:         newline ();
1086:     } catch (IOException e) {
1087:         fatal ("can't write", e);
1088:     }
1089:     }
1090: 
1091:     /** <b>SAX1</b>: indicates a non-expanded entity reference */
1092:     public void skippedEntity (String name)
1093:     throws SAXException
1094:     {
1095:     try {
1096:         rawWrite ("&");
1097:         rawWrite (name);
1098:         rawWrite (";");
1099:     } catch (IOException e) {
1100:         fatal ("can't write", e);
1101:     }
1102:     }
1103: 
1104:     // SAX2 LexicalHandler
1105: 
1106:     /** <b>SAX2</b>:  called before parsing CDATA characters */
1107:     final public void startCDATA ()
1108:     throws SAXException
1109:     {
1110:     if (locator == null)
1111:         locator = new LocatorImpl ();
1112:     
1113:     if (canonical)
1114:         return;
1115: 
1116:     try {
1117:         inCDATA = true;
1118:         if (entityNestLevel == 0)
1119:         rawWrite ("<![CDATA[");
1120:     } catch (IOException e) {
1121:         fatal ("can't write", e);
1122:     }
1123:     }
1124: 
1125:     /** <b>SAX2</b>:  called after parsing CDATA characters */
1126:     final public void endCDATA ()
1127:     throws SAXException
1128:     {
1129:     if (canonical)
1130:         return;
1131: 
1132:     try {
1133:         inCDATA = false;
1134:         if (entityNestLevel == 0)
1135:         rawWrite ("]]>");
1136:     } catch (IOException e) {
1137:         fatal ("can't write", e);
1138:     }
1139:     }
1140: 
1141:     /**
1142:      * <b>SAX2</b>:  called when the doctype is partially parsed
1143:      * Note that this, like other doctype related calls, is ignored
1144:      * when XHTML is in use.
1145:      */
1146:     final public void startDTD (String name, String publicId, String systemId)
1147:     throws SAXException
1148:     {
1149:     if (locator == null)
1150:         locator = new LocatorImpl ();
1151:     if (xhtml)
1152:         return;
1153:     try {
1154:         inDoctype = startedDoctype = true;
1155:         if (canonical)
1156:         return;
1157:         rawWrite ("<!DOCTYPE ");
1158:         rawWrite (name);
1159:         rawWrite (' ');
1160: 
1161:         if (!expandingEntities) {
1162:         if (publicId != null)
1163:             rawWrite ("PUBLIC '" + publicId + "' '" + systemId + "' ");
1164:         else if (systemId != null)
1165:             rawWrite ("SYSTEM '" + systemId + "' ");
1166:         }
1167: 
1168:         rawWrite ('[');
1169:         newline ();
1170:     } catch (IOException e) {
1171:         fatal ("can't write", e);
1172:     }
1173:     }
1174: 
1175:     /** <b>SAX2</b>:  called after the doctype is parsed */
1176:     final public void endDTD ()
1177:     throws SAXException
1178:     {
1179:     inDoctype = false;
1180:     if (canonical || xhtml)
1181:         return;
1182:     try {
1183:         rawWrite ("]>");
1184:         newline ();
1185:     } catch (IOException e) {
1186:         fatal ("can't write", e);
1187:     }
1188:     }
1189: 
1190:     /**
1191:      * <b>SAX2</b>:  called before parsing a general entity in content
1192:      */
1193:     final public void startEntity (String name)
1194:     throws SAXException
1195:     {
1196:     try {
1197:         boolean    writeEOL = true;
1198: 
1199:         // Predefined XHTML entities (for characters) will get
1200:         // mapped back later.
1201:         if (xhtml || expandingEntities)
1202:         return;
1203: 
1204:         entityNestLevel++;
1205:         if (name.equals ("[dtd]"))
1206:         return;
1207:         if (entityNestLevel != 1)
1208:         return;
1209:         if (!name.startsWith ("%")) {
1210:         writeEOL = false;
1211:         rawWrite ('&');
1212:         }
1213:         rawWrite (name);
1214:         rawWrite (';');
1215:         if (writeEOL)
1216:         newline ();
1217:     } catch (IOException e) {
1218:         fatal ("can't write", e);
1219:     }
1220:     }
1221: 
1222:     /**
1223:      * <b>SAX2</b>:  called after parsing a general entity in content
1224:      */
1225:     final public void endEntity (String name)
1226:     throws SAXException
1227:     {
1228:     if (xhtml || expandingEntities)
1229:         return;
1230:     entityNestLevel--;
1231:     }
1232: 
1233:     /**
1234:      * <b>SAX2</b>:  called when comments are parsed.
1235:      * When XHTML is used, the old HTML tradition of using comments
1236:      * to for inline CSS, or for JavaScript code is  discouraged.
1237:      * This is because XML processors are encouraged to discard, on
1238:      * the grounds that comments are for users (and perhaps text
1239:      * editors) not programs.  Instead, use external scripts
1240:      */
1241:     final public void comment (char ch [], int start, int length)
1242:     throws SAXException
1243:     {
1244:     if (locator == null)
1245:         locator = new LocatorImpl ();
1246: 
1247:     // don't print internal subset for XHTML
1248:     if (xhtml && startedDoctype)
1249:         return;
1250:     // don't print comment in doctype for canon xml
1251:     if (canonical && inDoctype)
1252:         return;
1253: 
1254:     try {
1255:         boolean indent;
1256: 
1257:         if (prettyPrinting && space.empty ())
1258:         fatal ("stack discipline", null);
1259:         indent = prettyPrinting && "default".equals (space.peek ());
1260:         if (entityNestLevel != 0)
1261:         return;
1262:         if (indent)
1263:         doIndent ();
1264:         if (canonical && inEpilogue)
1265:         newline ();
1266:         rawWrite ("<!--");
1267:         escapeChars (ch, start, length, CTX_UNPARSED);
1268:         rawWrite ("-->");
1269:         if (indent)
1270:         doIndent ();
1271:         if (elementNestLevel == 0 && !(canonical && inEpilogue))
1272:         newline ();
1273:     } catch (IOException e) {
1274:         fatal ("can't write", e);
1275:     }
1276:     }
1277: 
1278:     // SAX1 DTDHandler
1279: 
1280:     /** <b>SAX1</b>:  called on notation declarations */
1281:     final public void notationDecl (String name,
1282:         String publicId, String systemId)
1283:     throws SAXException
1284:     {
1285:     if (xhtml)
1286:         return;
1287:     try {
1288:         // At this time, only SAX2 callbacks start these.
1289:         if (!startedDoctype)
1290:         return;
1291: 
1292:         if (entityNestLevel != 0)
1293:         return;
1294:         rawWrite ("<!NOTATION " + name + " ");
1295:         if (publicId != null)
1296:         rawWrite ("PUBLIC \"" + publicId + '"');
1297:         else
1298:         rawWrite ("SYSTEM ");
1299:         if (systemId != null)
1300:         rawWrite ('"' + systemId + '"');
1301:         rawWrite (">");
1302:         newline ();
1303:     } catch (IOException e) {
1304:         fatal ("can't write", e);
1305:     }
1306:     }
1307: 
1308:     /** <b>SAX1</b>:  called on unparsed entity declarations */
1309:     final public void unparsedEntityDecl (String name,
1310:     String publicId, String systemId,
1311:     String notationName)
1312:     throws SAXException
1313:     {
1314:     if (xhtml)
1315:         return;
1316:     try {
1317:         // At this time, only SAX2 callbacks start these.
1318:         if (!startedDoctype)  {
1319:         // FIXME: write to temporary buffer, and make the start
1320:         // of the root element write these declarations.
1321:         return;
1322:         }
1323: 
1324:         if (entityNestLevel != 0)
1325:         return;
1326:         rawWrite ("<!ENTITY " + name + " ");
1327:         if (publicId != null)
1328:         rawWrite ("PUBLIC \"" + publicId + '"');
1329:         else
1330:         rawWrite ("SYSTEM ");
1331:         rawWrite ('"' + systemId + '"');
1332:         rawWrite (" NDATA " + notationName + ">");
1333:         newline ();
1334:     } catch (IOException e) {
1335:         fatal ("can't write", e);
1336:     }
1337:     }
1338: 
1339:     // SAX2 DeclHandler
1340: 
1341:     /** <b>SAX2</b>:  called on attribute declarations */
1342:     final public void attributeDecl (String eName, String aName,
1343:         String type, String mode, String value)
1344:     throws SAXException
1345:     {
1346:     if (xhtml)
1347:         return;
1348:     try {
1349:         // At this time, only SAX2 callbacks start these.
1350:         if (!startedDoctype)
1351:         return;
1352:         if (entityNestLevel != 0)
1353:         return;
1354:         rawWrite ("<!ATTLIST " + eName + ' ' + aName + ' ');
1355:         rawWrite (type);
1356:         rawWrite (' ');
1357:         if (mode != null)
1358:         rawWrite (mode + ' ');
1359:         if (value != null) 
1360:         writeQuotedValue (value, CTX_ATTRIBUTE);
1361:         rawWrite ('>');
1362:         newline ();
1363:     } catch (IOException e) {
1364:         fatal ("can't write", e);
1365:     }
1366:     }
1367: 
1368:     /** <b>SAX2</b>:  called on element declarations */
1369:     final public void elementDecl (String name, String model)
1370:     throws SAXException
1371:     {
1372:     if (xhtml)
1373:         return;
1374:     try {
1375:         // At this time, only SAX2 callbacks start these.
1376:         if (!startedDoctype)
1377:         return;
1378:         if (entityNestLevel != 0)
1379:         return;
1380:         rawWrite ("<!ELEMENT " + name + ' ' + model + '>');
1381:         newline ();
1382:     } catch (IOException e) {
1383:         fatal ("can't write", e);
1384:     }
1385:     }
1386: 
1387:     /** <b>SAX2</b>:  called on external entity declarations */
1388:     final public void externalEntityDecl (
1389:     String name,
1390:     String publicId,
1391:     String systemId)
1392:     throws SAXException
1393:     {
1394:     if (xhtml)
1395:         return;
1396:     try {
1397:         // At this time, only SAX2 callbacks start these.
1398:         if (!startedDoctype)
1399:         return;
1400:         if (entityNestLevel != 0)
1401:         return;
1402:         rawWrite ("<!ENTITY ");
1403:         if (name.startsWith ("%")) {
1404:         rawWrite ("% ");
1405:         rawWrite (name.substring (1));
1406:         } else
1407:         rawWrite (name);
1408:         if (publicId != null)
1409:         rawWrite (" PUBLIC \"" + publicId + '"');
1410:         else
1411:         rawWrite (" SYSTEM ");
1412:         rawWrite ('"' + systemId + "\">");
1413:         newline ();
1414:     } catch (IOException e) {
1415:         fatal ("can't write", e);
1416:     }
1417:     }
1418: 
1419:     /** <b>SAX2</b>:  called on internal entity declarations */
1420:     final public void internalEntityDecl (String name, String value)
1421:     throws SAXException
1422:     {
1423:     if (xhtml)
1424:         return;
1425:     try {
1426:         // At this time, only SAX2 callbacks start these.
1427:         if (!startedDoctype)
1428:         return;
1429:         if (entityNestLevel != 0)
1430:         return;
1431:         rawWrite ("<!ENTITY ");
1432:         if (name.startsWith ("%")) {
1433:         rawWrite ("% ");
1434:         rawWrite (name.substring (1));
1435:         } else
1436:         rawWrite (name);
1437:         rawWrite (' ');
1438:         writeQuotedValue (value, CTX_ENTITY);
1439:         rawWrite ('>');
1440:         newline ();
1441:     } catch (IOException e) {
1442:         fatal ("can't write", e);
1443:     }
1444:     }
1445: 
1446:     private void writeQuotedValue (String value, int code)
1447:     throws SAXException, IOException
1448:     {
1449:     char    buf [] = value.toCharArray ();
1450:     int    off = 0, len = buf.length;
1451: 
1452:     // we can't add line breaks to attribute/entity/... values
1453:     noWrap = true;
1454:     rawWrite ('"');
1455:     escapeChars (buf, off, len, code);
1456:     rawWrite ('"');
1457:     noWrap = false;
1458:     }
1459:     
    // From "HTMLlat1x.ent" ... names of entities for ISO-8859-1
    // (Latin/1) characters, all codes:  160-255 (0xA0-0xFF).
    // Codes 128-159 have no assigned values.
    //
    // The entry for character code "c" is at index "c - 160"; the
    // comments below give the code of the first name in each group.
    private static final String HTMLlat1x [] = {
        // 160
        "nbsp", "iexcl", "cent", "pound", "curren",
        "yen", "brvbar", "sect", "uml", "copy",

        // 170
        "ordf", "laquo", "not", "shy", "reg",
        "macr", "deg", "plusmn", "sup2", "sup3",

        // 180
        "acute", "micro", "para", "middot", "cedil",
        "sup1", "ordm", "raquo", "frac14", "frac12",

        // 190
        "frac34", "iquest", "Agrave", "Aacute", "Acirc",
        "Atilde", "Auml", "Aring", "AElig", "Ccedil",

        // 200
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave",
        "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",

        // 210
        "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml",
        "times", "Oslash", "Ugrave", "Uacute", "Ucirc",

        // 220
        "Uuml", "Yacute", "THORN", "szlig", "agrave",
        "aacute", "acirc", "atilde", "auml", "aring",

        // 230
        "aelig", "ccedil", "egrave", "eacute", "ecirc",
        "euml", "igrave", "iacute", "icirc", "iuml",

        // 240
        "eth", "ntilde", "ograve", "oacute", "ocirc",
        "otilde", "ouml", "divide", "oslash", "ugrave",

        // 250
        "uacute", "ucirc", "uuml", "yacute", "thorn",
        "yuml"
    };
1504: 
    // From "HTMLsymbolx.ent" ... some of the symbols that
    // we can conveniently handle.  Entities for the Greek
    // alphabet (upper and lower cases) are compact.
    //
    // Upper case:  the entry for character code "c" is at index
    // "c - 913", covering codes 913-937.  The null entry (code 930)
    // marks a code with no entity name in this table.
    private static final String HTMLsymbolx_GR [] = {
        // 913
        "Alpha", "Beta", "Gamma", "Delta", "Epsilon",
        "Zeta", "Eta", "Theta", "Iota", "Kappa",

        // 923
        "Lambda", "Mu", "Nu", "Xi", "Omicron",
        "Pi", "Rho", null, "Sigma", "Tau",

        // 933
        "Upsilon", "Phi", "Chi", "Psi", "Omega"
    };
1520: 
    // Entity names for the lower case Greek alphabet:  the entry for
    // character code "c" is at index "c - 945", covering codes
    // 945-969.  Code 962 is "sigmaf", so this table has no gap.
    private static final String HTMLsymbolx_gr [] = {
        // 945
        "alpha", "beta", "gamma", "delta", "epsilon",
        "zeta", "eta", "theta", "iota", "kappa",

        // 955
        "lambda", "mu", "nu", "xi", "omicron",
        "pi", "rho", "sigmaf", "sigma", "tau",

        // 965
        "upsilon", "phi", "chi", "psi", "omega"
    };
1533: 
1534: 
1535:     // General routine to write text and substitute predefined
1536:     // entities (XML, and a special case for XHTML) as needed.
1537:     private void escapeChars (char buf [], int off, int len, int code)
1538:     throws SAXException, IOException
1539:     {
1540:     int    first = 0;
1541: 
1542:     if (off < 0) {
1543:         off = 0;
1544:         len = buf.length;
1545:     }
1546:     for (int i = 0; i < len; i++) {
1547:         String    esc;
1548:         char     c = buf [off + i];
1549: 
1550:         switch (c) {
1551:           // Note that CTX_ATTRIBUTE isn't explicitly tested here;
1552:           // all syntax delimiters are escaped in CTX_ATTRIBUTE,
1553:           // otherwise it's similar to CTX_CONTENT
1554: 
1555:           // ampersand flags entity references; entity replacement
1556:           // text has unexpanded references, other text doesn't.
1557:           case '&':
1558:         if (code == CTX_ENTITY || code == CTX_UNPARSED)
1559:             continue;
1560:         esc = "amp";
1561:         break;
1562: 
1563:           // attributes and text may NOT have literal '<', but
1564:           // entities may have markup constructs
1565:           case '<':
1566:         if (code == CTX_ENTITY || code == CTX_UNPARSED)
1567:             continue;
1568:         esc = "lt";
1569:         break;
1570: 
1571:           // as above re markup constructs; but otherwise
1572:           // except when canonicalizing, this is for consistency
1573:           case '>':
1574:         if (code == CTX_ENTITY || code == CTX_UNPARSED)
1575:             continue;
1576:             esc = "gt";
1577:         break;
1578:           case '\'':
1579:         if (code == CTX_CONTENT || code == CTX_UNPARSED)
1580:             continue;
1581:         if (canonical)
1582:             continue;
1583:         esc = "apos";
1584:         break;
1585: 
1586:           // needed when printing quoted attribute/entity values
1587:           case '"':
1588:         if (code == CTX_CONTENT || code == CTX_UNPARSED)
1589:             continue;
1590:         esc = "quot";
1591:         break;
1592: 
1593:           // make line ends work per host OS convention
1594:           case '\n':
1595:         esc = eol;
1596:         break;
1597: 
1598:           //
1599:           // No other characters NEED special treatment ... except
1600:           // for encoding-specific issues, like whether the character
1601:           // can really be represented in that encoding.
1602:           //
1603:           default:
1604:         //
1605:         // There are characters we can never write safely; getting
1606:         // them is an error.
1607:         //
1608:         //   (a) They're never legal in XML ... detected by range 
1609:         //    checks, and (eventually) by remerging surrogate
1610:         //    pairs on output.  (Easy error for apps to prevent.)
1611:         //
1612:         //   (b) This encoding can't represent them, and we
1613:         //    can't make reference substitution (e.g. inside
1614:         //    CDATA sections, names, PI data, etc).  (Hard for
1615:         //    apps to prevent, except by using UTF-8 or UTF-16
1616:         //    as their output encoding.)
1617:         //
1618:         // We know a very little bit about what characters
1619:         // the US-ASCII and ISO-8859-1 encodings support.  For
1620:         // other encodings we can't detect the second type of
1621:         // error at all.  (Never an issue for UTF-8 or UTF-16.)
1622:         //
1623: 
1624: // FIXME:  CR in CDATA is an error; in text, turn to a char ref
1625: 
1626: // FIXME:  CR/LF/TAB in attributes should become char refs
1627: 
1628:         if ((c > 0xfffd)
1629:             || ((c < 0x0020) && !((c == 0x0009)
1630:                 || (c == 0x000A) || (c == 0x000D)))
1631:             || (((c & dangerMask) != 0)
1632:                 && (code == CTX_UNPARSED))) {
1633: 
1634:             // if case (b) in CDATA, we might end the section,
1635:             // write a reference, then restart ... possible
1636:             // in one DOM L3 draft.
1637: 
1638:             throw new CharConversionException (
1639:                 "Illegal or non-writable character: U+"
1640:                 + Integer.toHexString (c));
1641:         }
1642: 
1643:         //
1644:         // If the output encoding represents the character
1645:         // directly, let it do so!  Else we'll escape it.
1646:         //
1647:         if ((c & dangerMask) == 0)
1648:             continue;
1649:         esc = null;
1650: 
1651:         // Avoid numeric refs where symbolic ones exist, as
1652:         // symbolic ones make more sense to humans reading!
1653:         if (xhtml) {
1654:             // all the HTMLlat1x.ent entities
1655:             // (all the "ISO-8859-1" characters)
1656:             if (c >= 160 && c <= 255)
1657:             esc = HTMLlat1x [c - 160];
1658: 
1659:             // not quite half the HTMLsymbolx.ent entities
1660:             else if (c >= 913 && c <= 937)
1661:             esc = HTMLsymbolx_GR [c - 913];
1662:             else if (c >= 945 && c <= 969)
1663:             esc = HTMLsymbolx_gr [c - 945];
1664: 
1665:             else switch (c) {
1666:             // all of the HTMLspecialx.ent entities
1667:             case  338: esc = "OElig";    break;
1668:             case  339: esc = "oelig";    break;
1669:             case  352: esc = "Scaron";    break;
1670:             case  353: esc = "scaron";    break;
1671:             case  376: esc = "Yuml";    break;
1672:             case  710: esc = "circ";    break;
1673:             case  732: esc = "tilde";    break;
1674:             case 8194: esc = "ensp";    break;
1675:             case 8195: esc = "emsp";    break;
1676:             case 8201: esc = "thinsp";    break;
1677:             case 8204: esc = "zwnj";    break;
1678:             case 8205: esc = "zwj";        break;
1679:             case 8206: esc = "lrm";        break;
1680:             case 8207: esc = "rlm";        break;
1681:             case 8211: esc = "ndash";    break;
1682:             case 8212: esc = "mdash";    break;
1683:             case 8216: esc = "lsquo";    break;
1684:             case 8217: esc = "rsquo";    break;
1685:             case 8218: esc = "sbquo";    break;
1686:             case 8220: esc = "ldquo";    break;
1687:             case 8221: esc = "rdquo";    break;
1688:             case 8222: esc = "bdquo";    break;
1689:             case 8224: esc = "dagger";    break;
1690:             case 8225: esc = "Dagger";    break;
1691:             case 8240: esc = "permil";    break;
1692:             case 8249: esc = "lsaquo";    break;
1693:             case 8250: esc = "rsaquo";    break;
1694:             case 8364: esc = "euro";    break;
1695: 
1696:             // the other HTMLsymbolx.ent entities
1697:             case  402: esc = "fnof";    break;
1698:             case  977: esc = "thetasym";    break;
1699:             case  978: esc = "upsih";    break;
1700:             case  982: esc = "piv";        break;
1701:             case 8226: esc = "bull";    break;
1702:             case 8230: esc = "hellip";    break;
1703:             case 8242: esc = "prime";    break;
1704:             case 8243: esc = "Prime";    break;
1705:             case 8254: esc = "oline";    break;
1706:             case 8260: esc = "frasl";    break;
1707:             case 8472: esc = "weierp";    break;
1708:             case 8465: esc = "image";    break;
1709:             case 8476: esc = "real";    break;
1710:             case 8482: esc = "trade";    break;
1711:             case 8501: esc = "alefsym";    break;
1712:             case 8592: esc = "larr";    break;
1713:             case 8593: esc = "uarr";    break;
1714:             case 8594: esc = "rarr";    break;
1715:             case 8595: esc = "darr";    break;
1716:             case 8596: esc = "harr";    break;
1717:             case 8629: esc = "crarr";    break;
1718:             case 8656: esc = "lArr";    break;
1719:             case 8657: esc = "uArr";    break;
1720:             case 8658: esc = "rArr";    break;
1721:             case 8659: esc = "dArr";    break;
1722:             case 8660: esc = "hArr";    break;
1723:             case 8704: esc = "forall";    break;
1724:             case 8706: esc = "part";    break;
1725:             case 8707: esc = "exist";    break;
1726:             case 8709: esc = "empty";    break;
1727:             case 8711: esc = "nabla";    break;
1728:             case 8712: esc = "isin";    break;
1729:             case 8713: esc = "notin";    break;
1730:             case 8715: esc = "ni";        break;
1731:             case 8719: esc = "prod";    break;
1732:             case 8721: esc = "sum";        break;
1733:             case 8722: esc = "minus";    break;
1734:             case 8727: esc = "lowast";    break;
1735:             case 8730: esc = "radic";    break;
1736:             case 8733: esc = "prop";    break;
1737:             case 8734: esc = "infin";    break;
1738:             case 8736: esc = "ang";        break;
1739:             case 8743: esc = "and";        break;
1740:             case 8744: esc = "or";        break;
1741:             case 8745: esc = "cap";        break;
1742:             case 8746: esc = "cup";        break;
1743:             case 8747: esc = "int";        break;
1744:             case 8756: esc = "there4";    break;
1745:             case 8764: esc = "sim";        break;
1746:             case 8773: esc = "cong";    break;
1747:             case 8776: esc = "asymp";    break;
1748:             case 8800: esc = "ne";        break;
1749:             case 8801: esc = "equiv";    break;
1750:             case 8804: esc = "le";        break;
1751:             case 8805: esc = "ge";        break;
1752:             case 8834: esc = "sub";        break;
1753:             case 8835: esc = "sup";        break;
1754:             case 8836: esc = "nsub";    break;
1755:             case 8838: esc = "sube";    break;
1756:             case 8839: esc = "supe";    break;
1757:             case 8853: esc = "oplus";    break;
1758:             case 8855: esc = "otimes";    break;
1759:             case 8869: esc = "perp";    break;
1760:             case 8901: esc = "sdot";    break;
1761:             case 8968: esc = "lceil";    break;
1762:             case 8969: esc = "rceil";    break;
1763:             case 8970: esc = "lfloor";    break;
1764:             case 8971: esc = "rfloor";    break;
1765:             case 9001: esc = "lang";    break;
1766:             case 9002: esc = "rang";    break;
1767:             case 9674: esc = "loz";        break;
1768:             case 9824: esc = "spades";    break;
1769:             case 9827: esc = "clubs";    break;
1770:             case 9829: esc = "hearts";    break;
1771:             case 9830: esc = "diams";    break;
1772:             }
1773:         }
1774: 
1775:         // else escape with numeric char refs
1776:         if (esc == null) {
1777:             stringBuf.setLength (0);
1778:             stringBuf.append ("#x");
1779:             stringBuf.append (Integer.toHexString (c).toUpperCase ());
1780:             esc = stringBuf.toString ();
1781: 
1782:             // FIXME:  We don't write surrogate pairs correctly.
1783:             // They should work as one ref per character, since
1784:             // each pair is one character.  For reading back into
1785:             // Unicode, it matters beginning in Unicode 3.1 ...
1786:         }
1787:         break;
1788:         }
1789:         if (i != first)
1790:         rawWrite (buf, off + first, i - first);
1791:         first = i + 1;
1792:         if (esc == eol)
1793:         newline ();
1794:         else {
1795:         rawWrite ('&');
1796:         rawWrite (esc);
1797:         rawWrite (';');
1798:         }
1799:     }
1800:     if (first < len)
1801:         rawWrite (buf, off + first, len - first);
1802:     }
1803: 
1804: 
1805: 
1806:     private void newline ()
1807:     throws SAXException, IOException
1808:     {
1809:     out.write (eol);
1810:     column = 0;
1811:     }
1812: 
1813:     private void doIndent ()
1814:     throws SAXException, IOException
1815:     {
1816:     int    space = elementNestLevel * 2;
1817: 
1818:     newline ();
1819:     column = space;
1820:     // track tabs only at line starts
1821:     while (space > 8) {
1822:         out.write ("\t");
1823:         space -= 8;
1824:     }
1825:     while (space > 0) {
1826:         out.write ("  ");
1827:         space -= 2;
1828:     }
1829:     }
1830: 
1831:     private void rawWrite (char c)
1832:     throws IOException
1833:     {
1834:     out.write (c);
1835:     column++;
1836:     }
1837: 
1838:     private void rawWrite (String s)
1839:     throws SAXException, IOException
1840:     {
1841:     if (prettyPrinting && "default".equals (space.peek ())) {
1842:         char data [] = s.toCharArray ();
1843:         rawWrite (data, 0, data.length);
1844:     } else {
1845:         out.write (s);
1846:         column += s.length ();
1847:     }
1848:     }
1849: 
1850:     // NOTE:  if xhtml, the REC gives some rules about whitespace
1851:     // which we could follow ... notably, many places where conformant
1852:     // agents "must" consolidate/normalize whitespace.  Line ends can
1853:     // be removed there, etc.  This may not be the right place to do
1854:     // such mappings though.
1855: 
1856:     // Line buffering may help clarify algorithms and improve results.
1857: 
1858:     // It's likely xml:space needs more attention.
1859: 
1860:     private void rawWrite (char buf [], int offset, int length)
1861:     throws SAXException, IOException
1862:     {
1863:     boolean        wrap;
1864: 
1865:     if (prettyPrinting && space.empty ())
1866:         fatal ("stack discipline", null);
1867: 
1868:     wrap = prettyPrinting && "default".equals (space.peek ());
1869:     if (!wrap) {
1870:         out.write (buf, offset, length);
1871:         column += length;
1872:         return;
1873:     }
1874: 
1875:     // we're pretty printing and want to fill lines out only
1876:     // to the desired line length.
1877:     while (length > 0) {
1878:         int        target = lineLength - column;
1879:         boolean    wrote = false;
1880: 
1881:         // Do we even have a problem?
1882:         if (target > length || noWrap) {
1883:         out.write (buf, offset, length);
1884:         column += length;
1885:         return;
1886:         }
1887: 
1888:         // break the line at a space character, trying to fill
1889:         // as much of the line as possible.
1890:         char    c;
1891: 
1892:         for (int i = target - 1; i >= 0; i--) {
1893:         if ((c = buf [offset + i]) == ' ' || c == '\t') {
1894:             i++;
1895:             out.write (buf, offset, i);
1896:             doIndent ();
1897:             offset += i;
1898:             length -= i;
1899:             wrote = true;
1900:             break;
1901:         }
1902:         }
1903:         if (wrote)
1904:         continue;
1905:         
1906:         // no space character permitting break before target
1907:         // line length is filled.  So, take the next one.
1908:         if (target < 0)
1909:         target = 0;
1910:         for (int i = target; i < length; i++)
1911:         if ((c = buf [offset + i]) == ' ' || c == '\t') {
1912:             i++;
1913:             out.write (buf, offset, i);
1914:             doIndent ();
1915:             offset += i;
1916:             length -= i;
1917:             wrote = true;
1918:             break;
1919:         }
1920:         if (wrote)
1921:         continue;
1922:         
1923:         // no such luck.
1924:         out.write (buf, offset, length);
1925:         column += length;
1926:         break;
1927:     }
1928:     }
1929: }