Frames | No Frames |
1: /* DomHTMLParser.java -- 2: Copyright (C) 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package gnu.xml.dom.html2; 40: 41: import gnu.javax.swing.text.html.parser.support.Parser; 42: 43: import java.io.IOException; 44: import java.io.Reader; 45: 46: import java.util.Enumeration; 47: import java.util.Iterator; 48: import java.util.LinkedList; 49: 50: import javax.swing.text.AttributeSet; 51: import javax.swing.text.html.HTML; 52: import javax.swing.text.html.parser.DTD; 53: import javax.swing.text.html.parser.TagElement; 54: 55: import org.w3c.dom.NamedNodeMap; 56: import org.w3c.dom.Node; 57: import org.w3c.dom.html2.HTMLDocument; 58: 59: /** 60: * This parser reads HTML from the given stream and stores into 61: * {@link HTMLDocument}. The HTML tag becomes the {@link Node}. 62: * The tag attributes become the node attributes. The text inside 63: * HTML tag is inserted as one or several text nodes. The nested 64: * HTML tags are inserted as child nodes. 65: * 66: * If the strict tree structure, closing the tag means closing all 67: * nested tags. To work around this, this parser closes the nested 68: * tags and immediately reopens them after the closed tag. 69: * In this way, <code><b><i>c</b>d</code> 70: * is parsed as <code><b><i>c</i></b><i>d</code> . 71: * 72: * @author Audrius Meskauskas (AudriusA@Bioinformatics.org) 73: */ 74: public class DomHTMLParser 75: extends gnu.javax.swing.text.html.parser.support.Parser 76: { 77: /** 78: * The target where HTML document will be inserted. 79: */ 80: protected DomHTMLDocument document; 81: 82: /** 83: * The subsequently created new nodes will be inserted as the 84: * childs of this cursor. 85: */ 86: protected Node cursor; 87: 88: /** 89: * Create parser using the given DTD. 90: * 91: * @param dtd the DTD (for example, 92: * {@link gnu.javax.swing.text.html.parser.HTML_401F}). 93: */ 94: public DomHTMLParser(DTD dtd) 95: { 96: super(dtd); 97: } 98: 99: /** 100: * Parse SGML insertion ( <! ... > ). 101: * Currently just treats it as comment. 102: */ 103: public boolean parseMarkupDeclarations(StringBuffer strBuff) 104: throws java.io.IOException 105: { 106: Node c = document.createComment(strBuff.toString()); 107: cursor.appendChild(c); 108: return false; 109: } 110: 111: /** 112: * Read the document, present in the given stream, and 113: * return the corresponding {@link HTMLDocument}. 114: * 115: * @param input a stream to read from. 116: * @return a document, reflecting the structure of the provided HTML 117: * text. 118: * 119: * @throws IOException if the reader throws one. 120: */ 121: public HTMLDocument parseDocument(Reader input) 122: throws IOException 123: { 124: try 125: { 126: document = new DomHTMLDocument(); 127: document.setCheckWellformedness(false); 128: document.setCheckingCharacters(false); 129: 130: cursor = document; 131: 132: parse(input); 133: 134: DomHTMLDocument h = document; 135: document = null; 136: return h; 137: } 138: catch (Exception ex) 139: { 140: ex.printStackTrace(); 141: throw new IOException("Exception: " + ex.getMessage()); 142: } 143: } 144: 145: /** 146: * Create a new node. 147: * @param name the name of node, case insensitive. 148: * @return the created node. 149: */ 150: protected Node createNode(String name) 151: { 152: Node new_node = document.createElement(name.toLowerCase()); 153: AttributeSet hatts = getAttributes(); 154: NamedNodeMap natts = new_node.getAttributes(); 155: 156: Enumeration enumeration = hatts.getAttributeNames(); 157: Object key; 158: Node attribute; 159: 160: while (hatts != null) 161: { 162: while (enumeration.hasMoreElements()) 163: { 164: key = enumeration.nextElement(); 165: attribute = document.createAttribute(key.toString()); 166: attribute.setNodeValue(hatts.getAttribute(key).toString()); 167: natts.setNamedItem(attribute); 168: } 169: 170: // The default values are stored in a parent node. 171: hatts = hatts.getResolveParent(); 172: } 173: 174: return new_node; 175: } 176: 177: /** 178: * Handle comment by inserting the comment node. 179: * @param text the comment text. 180: */ 181: protected void handleComment(char[] text) 182: { 183: Node c = document.createComment(new String(text)); 184: cursor.appendChild(c); 185: } 186: 187: /** 188: * Handle the tag with no content. 189: * @param tag the tag to handle. 190: */ 191: protected void handleEmptyTag(TagElement tag) 192: { 193: String name = tag.getHTMLTag().toString(); 194: 195: if (name.equalsIgnoreCase("#pcdata")) 196: return; 197: 198: Node c = createNode(name); 199: cursor.appendChild(c); 200: } 201: 202: /** 203: * Close the given tag. Close and reopen all nested tags. 204: * @param tag the tag to close. 205: */ 206: protected void handleEndTag(TagElement tag) 207: { 208: String name = tag.getHTMLTag().toString(); 209: String nname = cursor.getNodeName(); 210: 211: // Closing the current tag. 212: if (nname != null && nname.equalsIgnoreCase(name)) 213: { 214: cursor = cursor.getParentNode(); 215: } 216: else 217: { 218: Node nCursor = cursor.getParentNode(); 219: 220: // Remember the opened nodes. 221: LinkedList open = new LinkedList(); 222: Node close = cursor; 223: while (close != null && !close.getNodeName().equalsIgnoreCase(name)) 224: { 225: if (close != document) 226: open.addFirst(close); 227: close = close.getParentNode(); 228: } 229: if (close == null) 230: cursor = document; 231: else 232: cursor = close.getParentNode(); 233: 234: // Insert the copies of the opened nodes. 235: Iterator iter = open.iterator(); 236: while (iter.hasNext()) 237: { 238: Node item = (Node) iter.next(); 239: cursor.appendChild(item); 240: cursor = item; 241: } 242: } 243: } 244: 245: /** 246: * Handle the start tag by inserting the HTML element. 247: * @param tag the tag to handle. 248: */ 249: protected void handleStartTag(TagElement tag) 250: { 251: HTML.Tag h = tag.getHTMLTag(); 252: Node c = createNode(h.toString()); 253: cursor.appendChild(c); 254: cursor = c; 255: } 256: 257: /** 258: * Handle text by inserting the text node. 259: * @param text the text to insert. 260: */ 261: protected void handleText(char[] text) 262: { 263: Node c = document.createTextNode(text, 0, text.length); 264: cursor.appendChild(c); 265: } 266: }