/* * Copyright (c) 2008, Harald Kuhr * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name "TwelveMonkeys" nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package com.twelvemonkeys.xml; import com.twelvemonkeys.lang.StringUtil; import org.w3c.dom.*; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.*; import java.nio.charset.Charset; import java.util.Date; /** * XMLSerializer * * @author Harald Kuhr * @author last modified by $Author: haku $ * @version $Id: //depot/branches/personal/haraldk/twelvemonkeys/release-2/twelvemonkeys-core/src/main/java/com/twelvemonkeys/xml/XMLSerializer.java#1 $ */ public class XMLSerializer { // TODO: Replace with DOMSerializer? Test performance, pretty printing etc... // Main problem: Sun's Java 5 does not have LS 3.0 support // This class has no dependencies, which probably makes it more useful // TODO: Don't insert initial and ending line-break for text-nodes // TODO: Support not inserting line-breaks, to preserve space // TODO: Support line breaking (at configurable width) // TODO: Support standalone? // TODO: Support more than version 1.0? // TODO: Consider using IOException to communicate trouble, rather than RTE, // to be more compatible... // TODO: Idea: Create a SerializationContext that stores attributes on // serialization, to keep the serialization thread-safe // Store preserveSpace attribute in this context, to avoid costly traversals // Store user options here too // TODO: Push/pop? private final OutputStream mOutput; private final Charset mEncoding; private final SerializationContext mContext; public XMLSerializer(final OutputStream pOutput, final String pEncoding) { mOutput = pOutput; mEncoding = Charset.forName(pEncoding); mContext = new SerializationContext(); } public final void setIndentation(String pIndent) { mContext.indent = pIndent != null ? pIndent : " "; } public final void setStripComments(boolean pStrip) { mContext.stripComments = pStrip; } /** * Serializes the entire document, along with the XML declaration * ({@code <?xml version="1.0" encoding="..."?>}). * * @param pDocument the document to serialize. */ public void serialize(final Document pDocument) { serialize(pDocument, true); } /** * Serializes the entire sub tree starting at {@code pRootNode}, along with an optional XML declaration * ({@code <?xml version="1.0" encoding="..."?>}). * * @param pRootNode the root node to serialize. * @param pWriteXMLDeclaration {@code true} if the XML declaration should be included, otherwise {@code false}. */ public void serialize(final Node pRootNode, final boolean pWriteXMLDeclaration) { PrintWriter out = new PrintWriter(new OutputStreamWriter(mOutput, mEncoding)); try { if (pWriteXMLDeclaration) { writeXMLDeclaration(out); } writeXML(out, pRootNode, mContext.copy()); } finally { out.flush(); } } private void writeXMLDeclaration(final PrintWriter pOut) { pOut.print(""); } private void writeXML(final PrintWriter pOut, final Node pDocument, final SerializationContext pContext) { writeNodeRecursive(pOut, pDocument, pContext); } private void writeNodeRecursive(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) { if (pNode.getNodeType() != Node.TEXT_NODE) { indentToLevel(pOut, pContext); } switch (pNode.getNodeType()) { case Node.DOCUMENT_NODE: case Node.DOCUMENT_FRAGMENT_NODE: writeDocument(pOut, pNode, pContext); break; case Node.DOCUMENT_TYPE_NODE: writeDoctype(pOut, (DocumentType) pNode); break; case Node.ELEMENT_NODE: boolean preserveSpace = pContext.preserveSpace; updatePreserveSpace(pNode, pContext); writeElement(pOut, (Element) pNode, pContext); pContext.preserveSpace = preserveSpace; break; case Node.CDATA_SECTION_NODE: writeCData(pOut, pNode); break; case Node.TEXT_NODE: writeText(pOut, pNode, pContext); break; case Node.COMMENT_NODE: writeComment(pOut, pNode, pContext); break; case Node.PROCESSING_INSTRUCTION_NODE: writeProcessingInstruction(pOut, (ProcessingInstruction) pNode); break; case Node.ATTRIBUTE_NODE: throw new IllegalArgumentException("Malformed input Document: Attribute nodes should only occur inside Element nodes"); case Node.ENTITY_NODE: // '' case Node.ENTITY_REFERENCE_NODE: // ( '&' | '%' ) + getNodeName + ';' case Node.NOTATION_NODE: // '' default: throw new InternalError("Lazy programmer never implemented serialization of " + pNode.getClass()); } } private void writeProcessingInstruction(final PrintWriter pOut, final ProcessingInstruction pNode) { pOut.print("\n"); } private void writeText(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) { // TODO: Is this really as specified? String value = pNode.getNodeValue(); if (pContext.preserveSpace) { pOut.print(maybeEscapeElementValue(value)); } else if (!StringUtil.isEmpty(value)) { String escapedValue = maybeEscapeElementValue(value.trim()); //if (escapedValue.length() + (pContext.level * pContext.indent.length()) > 78) { indentToLevel(pOut, pContext); //} pOut.println(escapedValue); } } private void writeCData(final PrintWriter pOut, final Node pNode) { pOut.print(""); } private static void updatePreserveSpace(final Node pNode, final SerializationContext pContext) { NamedNodeMap attributes = pNode.getAttributes(); if (attributes != null) { Node space = attributes.getNamedItem("xml:space"); if (space != null) { if ("preserve".equals(space.getNodeValue())) { pContext.preserveSpace = true; } else if ("default".equals(space.getNodeValue())) { pContext.preserveSpace = false; } // No other values are allowed per spec, ignore } } } private static void indentToLevel(final PrintWriter pOut, final SerializationContext pContext) { for (int i = 0; i < pContext.level; i++) { pOut.print(pContext.indent); } } private void writeComment(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) { if (pContext.stripComments) { return; } String value = pNode.getNodeValue(); validateCommentValue(value); if (value.startsWith(" ")) { pOut.print(""); } else { pOut.println(" -->"); } } /** * Returns an escaped version of the input string. The string is guaranteed * to not contain illegal XML characters ({@code &<>}). * If no escaping is needed, the input string is returned as is. * * @param pValue the input string that might need escaping. * @return an escaped version of the input string. */ static String maybeEscapeElementValue(final String pValue) { int startEscape = needsEscapeElement(pValue); if (startEscape < 0) { // If no escaping is needed, simply return original return pValue; } else { // Otherwise, start replacing StringBuilder builder = new StringBuilder(pValue.substring(0, startEscape)); builder.ensureCapacity(pValue.length() + 30); int pos = startEscape; for (int i = pos; i < pValue.length(); i++) { switch (pValue.charAt(i)) { case '&': pos = appendAndEscape(pValue, pos, i, builder, "&"); break; case '<': pos = appendAndEscape(pValue, pos, i, builder, "<"); break; case '>': pos = appendAndEscape(pValue, pos, i, builder, ">"); break; //case '\'': // pos = appendAndEscape(pString, pos, i, builder, "'"); // break; //case '"': // pos = appendAndEscape(pString, pos, i, builder, """); // break; default: break; } } builder.append(pValue.substring(pos)); return builder.toString(); } } private static int appendAndEscape(final String pString, int pStart, final int pEnd, final StringBuilder pBuilder, final String pEntity) { pBuilder.append(pString.substring(pStart, pEnd)); pBuilder.append(pEntity); return pEnd + 1; } /** * Returns an the first index from the input string that should be escaped * if escaping is needed, otherwise {@code -1}. * * @param pString the input string that might need escaping. * @return the first index from the input string that should be escaped, * or {@code -1}. */ private static int needsEscapeElement(final String pString) { for (int i = 0; i < pString.length(); i++) { switch (pString.charAt(i)) { case '&': case '<': case '>': //case '\'': //case '"': return i; default: } } return -1; } private static String maybeEscapeAttributeValue(final String pValue) { int startEscape = needsEscapeAttribute(pValue); if (startEscape < 0) { return pValue; } else { StringBuilder builder = new StringBuilder(pValue.substring(0, startEscape)); builder.ensureCapacity(pValue.length() + 16); int pos = startEscape; for (int i = pos; i < pValue.length(); i++) { switch (pValue.charAt(i)) { case '&': pos = appendAndEscape(pValue, pos, i, builder, "&"); break; case '"': pos = appendAndEscape(pValue, pos, i, builder, """); break; default: break; } } //StringBuilder builder = new StringBuilder(pValue.length() + 30); // //int start = 0; //while (end >= 0) { // builder.append(pValue.substring(start, end)); // builder.append("""); // start = end + 1; // end = pValue.indexOf('"', start); //} //builder.append(pValue.substring(start)); builder.append(pValue.substring(pos)); return builder.toString(); } } /** * Returns an the first index from the input string that should be escaped * if escaping is needed, otherwise {@code -1}. * * @param pString the input string that might need escaping. * @return the first index from the input string that should be escaped, * or {@code -1}. */ private static int needsEscapeAttribute(final String pString) { for (int i = 0; i < pString.length(); i++) { switch (pString.charAt(i)) { case '&': //case '<': //case '>': //case '\'': case '"': return i; default: } } return -1; } private static String validateCDataValue(final String pValue) { if (pValue.indexOf("]]>") >= 0) { throw new IllegalArgumentException("Malformed input document: CDATA block may not contain the string ']]>'"); } return pValue; } private static String validateCommentValue(final String pValue) { if (pValue.indexOf("--") >= 0) { throw new IllegalArgumentException("Malformed input document: Comment may not contain the string '--'"); } return pValue; } private void writeDocument(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) { // Document fragments might not have child nodes... if (pNode.hasChildNodes()) { NodeList nodes = pNode.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { writeNodeRecursive(pOut, nodes.item(i), pContext); } } } private void writeElement(final PrintWriter pOut, final Element pNode, final SerializationContext pContext) { pOut.print("<"); pOut.print(pNode.getTagName()); // TODO: Attributes should probably include namespaces, so that it works // even if the document was created using attributes instead of namespaces... // In that case, prefix will be null... // TODO: Don't insert duplicate/unnecessary namesspace declarations // Handle namespace String namespace = pNode.getNamespaceURI(); if (namespace != null && !namespace.equals(pContext.defaultNamespace)) { String prefix = pNode.getPrefix(); if (prefix == null) { pContext.defaultNamespace = namespace; pOut.print(" xmlns"); } else { pOut.print(" xmlns:"); pOut.print(prefix); } pOut.print("=\""); pOut.print(namespace); pOut.print("\""); } // Iterate attributes if any if (pNode.hasAttributes()) { NamedNodeMap attributes = pNode.getAttributes(); for (int i = 0; i < attributes.getLength(); i++) { Attr attribute = (Attr) attributes.item(i); String name = attribute.getName(); if (!(name.startsWith("xmlns") && (name.length() == 5 || name.charAt(5) == ':'))) { pOut.print(" "); pOut.print(name); pOut.print("=\""); pOut.print(maybeEscapeAttributeValue(attribute.getValue())); pOut.print("\""); } //else { // System.err.println("attribute.getName(): " + name); //} } } // TODO: Consider not indenting/newline if the first child is a text node // Iterate children if any if (pNode.hasChildNodes()) { pOut.print(">"); if (!pContext.preserveSpace) { pOut.println(); } NodeList children = pNode.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { writeNodeRecursive(pOut, children.item(i), pContext.push()); } if (!pContext.preserveSpace) { indentToLevel(pOut, pContext); } pOut.print(""); } else { pOut.println("/>"); } } private void writeDoctype(final PrintWriter pOut, final DocumentType pDoctype) { // NOTE: The DOMImplementationLS LSSerializer actually inserts SYSTEM or // PUBLIC identifiers even if they are empty strings. The result is, it // will create invalid documents. // Testing for empty strings seems to be more compatible. if (pDoctype != null) { pOut.print(""); } } public static void main(String[] pArgs) throws IOException, SAXException { // Build XML tree (Document) and write // Find the implementation DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder; try { builder = factory.newDocumentBuilder(); } catch (ParserConfigurationException e) { //noinspection ThrowableInstanceNeverThrown BOGUS throw (IOException) new IOException(e.getMessage()).initCause(e); } DOMImplementation dom = builder.getDOMImplementation(); Document document = dom.createDocument("http://www.twelvemonkeys.com/xml/test", "test", dom.createDocumentType("test", null, null)); Element root = document.getDocumentElement(); // This is probably not the correct way of setting a default namespace //root.setAttribute("xmlns", "http://www.twelvemonkeys.com/xml/test"); // Create and insert the normal Properties headers as XML comments document.insertBefore(document.createComment(new Date().toString()), root); Element test = document.createElement("sub"); root.appendChild(test); Element more = document.createElementNS("http://more.com/1999/namespace", "more:more"); more.setAttribute("foo", "test"); more.setAttribute("bar", "'really' \"legal\" & ok"); test.appendChild(more); more.appendChild(document.createTextNode("Simply some text.")); more.appendChild(document.createCDATASection("&something escaped;")); more.appendChild(document.createTextNode("More & !")); more.appendChild(document.createTextNode("\"<<'&'>>\"")); Element another = document.createElement("another"); test.appendChild(another); Element yet = document.createElement("yet-another"); yet.setAttribute("this-one", "with-params"); test.appendChild(yet); Element pre = document.createElementNS("http://www.twelvemonkeys.com/xml/test", "pre"); pre.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:space", "preserve"); pre.appendChild(document.createTextNode(" \t \n\r some text & white ' ' \n ")); test.appendChild(pre); // Create serializer and output document //XMLSerializer serializer = new XMLSerializer(pOutput, new OutputFormat(document, UTF_8_ENCODING, true)); System.out.println("XMLSerializer:"); XMLSerializer serializer = new XMLSerializer(System.out, "UTF-8"); serializer.serialize(document); System.out.println(); System.out.println("DOMSerializer:"); DOMSerializer serializerD = new DOMSerializer(System.out, "UTF-8"); serializerD.setPrettyPrint(true); serializerD.serialize(document); System.out.println(); System.out.println("\n"); ByteArrayOutputStream out = new ByteArrayOutputStream(); XMLSerializer serializer2 = new XMLSerializer(out, "UTF-8"); serializer2.serialize(document); ByteArrayOutputStream outD = new ByteArrayOutputStream(); DOMSerializer serializer2D = new DOMSerializer(outD, "UTF-8"); serializer2D.serialize(document); Document document2 = builder.parse(new ByteArrayInputStream(out.toByteArray())); System.out.println("XMLSerializer reparsed XMLSerializer:"); serializer.serialize(document2); System.out.println(); System.out.println("DOMSerializer reparsed XMLSerializer:"); serializerD.serialize(document2); System.out.println(); Document documentD = builder.parse(new ByteArrayInputStream(outD.toByteArray())); System.out.println("XMLSerializer reparsed DOMSerializer:"); serializer.serialize(documentD); System.out.println(); System.out.println("DOMSerializer reparsed DOMSerializer:"); serializerD.serialize(documentD); System.out.println(); } static class SerializationContext implements Cloneable { String indent = " "; int level = 0; boolean preserveSpace = false; boolean stripComments = false; String defaultNamespace; public SerializationContext copy() { try { return (SerializationContext) clone(); } catch (CloneNotSupportedException e) { throw new Error(e); } } public SerializationContext push() { SerializationContext context = copy(); context.level++; return context; } } }