Reputation: 158261
I have the below element in a web service response. As you can see, it's escaped XML dumped as CDATA, so the XML parser just looks at it as a string and I'm unable to get the data I need from it through the usual means of XSLT and XPath. I need to turn this ugly string back into XML so that I can read it properly.
I have tried to do a search replace and simply converted all <
to <
and >
to >
and this works great, but there is a problem: The message.body
element can actually contain HTML which is not valid XML. Might not even be valid HTML for all I know. So if I just replace everything, this will probably crash when I try to turn the string back into an XML document.
How can I unescape this safely? Is there a good way to do the replacement in the whole string except between the message.body
open and closing tags for example?
<output><item type="object">
<ticket.id type="string">171</ticket.id>
<ticket.title type="string">SoapUI Test</ticket.title>
<ticket.created_at type="string">2013-12-03 12:50:54</ticket.created_at>
<ticket.status type="string">Open</ticket.status>
<updated type="string">false</updated>
<message type="object">
<message.id type="string">520</message.id>
<message.created_at type="string">2013-12-03 12:50:54.000</message.created_at>
<message.author type="string"/>
<message.body type="string">Just a test message...</message.body>
</message>
<message type="object">
<message.id type="string">521</message.id>
<message.created_at type="string">2013-12-03 13:58:32.000</message.created_at>
<message.author type="string"/>
<message.body type="string">Another message!</message.body>
</message>
</item>
</output>
Upvotes: 0
Views: 2502
Reputation: 158261
This is my current solution. You give it an XPath for the nodes that are messed up and a set of element names that might include messed up HTML and other problems. Works roughly as follows
The regex solution in step 2 is probably not fool-proof, but don't really see a better solution at the moment. If you do, let me know!
CDataFixer
import java.util.*;
import javax.xml.xpath.*;
import org.w3c.dom.*;
public class CDataFixer
{
private final XmlHelper xml = XmlHelper.getInstance();
public Document fix(Document document, String nodesToFix, Set<String> excludes) throws XPathExpressionException, XmlException
{
return fix(document, xml.newXPath().compile(nodesToFix), excludes);
}
private Document fix(Document document, XPathExpression nodesToFix, Set<String> excludes) throws XPathExpressionException, XmlException
{
Document wc = xml.copy(document);
NodeList nodes = (NodeList) nodesToFix.evaluate(wc, XPathConstants.NODESET);
int nodeCount = nodes.getLength();
for(int n=0; n<nodeCount; n++)
parse(nodes.item(n), excludes);
return wc;
}
private void parse(Node node, Set<String> excludes) throws XmlException
{
String text = node.getTextContent();
for(String exclude : excludes)
{
String regex = String.format("(?s)(<%1$s\\b[^>]*>)(.*?)(</%1$s>)", Pattern.quote(exclude));
text = text.replaceAll(regex, "$1<![CDATA[$2]]>$3");
}
String randomNode = "tmp_"+UUID.randomUUID().toString();
text = String.format("<%1$s>%2$s</%1$s>", randomNode, text);
NodeList parsed = xml
.parse(text)
.getFirstChild()
.getChildNodes();
node.setTextContent(null);
for(int n=0; n<parsed.getLength(); n++)
node.appendChild(node.getOwnerDocument().importNode(parsed.item(n), true));
}
}
XmlHelper
import java.io.*;
import javax.xml.parsers.*;
import javax.xml.transform.*;
import javax.xml.transform.dom.*;
import javax.xml.transform.sax.*;
import javax.xml.transform.stream.*;
import javax.xml.xpath.*;
import org.w3c.dom.*;
import org.xml.sax.*;
public final class XmlHelper
{
private static final XmlHelper instance = new XmlHelper();
public static XmlHelper getInstance()
{
return instance;
}
private final SAXTransformerFactory transformerFactory;
private final DocumentBuilderFactory documentBuilderFactory;
private final XPathFactory xpathFactory;
private XmlHelper()
{
documentBuilderFactory = DocumentBuilderFactory.newInstance();
documentBuilderFactory.setNamespaceAware(true);
xpathFactory = XPathFactory.newInstance();
TransformerFactory tf = TransformerFactory.newInstance();
if (!tf.getFeature(SAXTransformerFactory.FEATURE))
throw new RuntimeException("Failed to create SAX-compatible TransformerFactory.");
transformerFactory = (SAXTransformerFactory) tf;
}
public DocumentBuilder newDocumentBuilder()
{
try
{
return documentBuilderFactory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
throw new RuntimeException("Failed to create new "+DocumentBuilder.class, e);
}
}
public XPath newXPath()
{
return xpathFactory.newXPath();
}
public Transformer newIdentityTransformer(boolean omitXmlDeclaration, boolean indent)
{
try
{
Transformer transformer = transformerFactory.newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, indent ? "yes" : "no");
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes" : "no");
return transformer;
}
catch (TransformerConfigurationException e)
{
throw new RuntimeException("Failed to create Transformer instance: "+e.getMessage(), e);
}
}
public Templates newTemplates(String xslt) throws XmlException
{
try
{
return transformerFactory.newTemplates(new DOMSource(parse(xslt)));
}
catch (TransformerConfigurationException e)
{
throw new RuntimeException("Failed to create templates: "+e.getMessage(), e);
}
}
public Document parse(String xml) throws XmlException
{
return parse(new InputSource(new StringReader(xml)));
}
public Document parse(InputSource xml) throws XmlException
{
try
{
return newDocumentBuilder().parse(xml);
}
catch (SAXException e)
{
throw new XmlException("Failed to parse xml: "+e.getMessage(), e);
}
catch (IOException e)
{
throw new XmlException("Failed to read xml: "+e.getMessage(), e);
}
}
public String toString(Node node)
{
return toString(node, true, false);
}
public String toString(Node node, boolean omitXMLDeclaration, boolean indent)
{
try
{
StringWriter writer = new StringWriter();
newIdentityTransformer(omitXMLDeclaration, indent)
.transform(new DOMSource(node), new StreamResult(writer));
return writer.toString();
}
catch (TransformerException e)
{
throw new RuntimeException("Failed to transform XML into string: " + e.getMessage(), e);
}
}
public Document copy(Document document)
{
DOMSource source = new DOMSource(document);
DOMResult result = new DOMResult();
try
{
newIdentityTransformer(true, false)
.transform(source, result);
return (Document) result.getNode();
}
catch (TransformerException e)
{
throw new RuntimeException("Failed to copy XML: " + e.getMessage(), e);
}
}
}
Upvotes: 0
Reputation: 1887
This is actually lifted from the project i'm working on right now.
private Node stringToNode(String textContent) {
Element node = null;
try {
node = DocumentBuilderFactory.newInstance().newDocumentBuilder()
.parse(new ByteArrayInputStream(textContent.getBytes()))
.getDocumentElement();
} catch (SAXException e) {
logger.error(e.getMessage(), e);
} catch (IOException e) {
logger.error(e.getMessage(), e);
} catch (ParserConfigurationException e) {
logger.error(e.getMessage(), e);
}
return node;
}
This will give you a document object representing the string. I use this to get this back into the original document:
if (textContent.contains(XML_HEADER)) {
textContent = textContent.substring(textContent.indexOf(XML_HEADER) + XML_HEADER.length());
}
Node newNode = stringToNode(textContent);
if (newNode != null) {
Node importedNode = soapBody.getOwnerDocument().importNode(newNode, true);
nextChild.setTextContent(null);
nextChild.appendChild(importedNode);
}
Upvotes: 0