Reputation: 61
I'm processing XML documents to submit to HMRC in the UK. These documents need to be processed and have a hash code generated for them.
I've written the code below, which works well for small documents. However, if I process a 60 MB XML file it uses around 1.2 GB of memory.
I've had a look to see if there is any way I can make it more efficient, but can't see anything. It needs to be able to remove the IRMark element if it already exists.
Any ideas are much appreciated. Thanks.
/*
** Generates the HMRC IRMARK as required.
*/
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import org.apache.xml.security.Init;
import org.apache.xml.security.c14n.CanonicalizationException;
import org.apache.xml.security.c14n.Canonicalizer;
import org.apache.xml.security.c14n.InvalidCanonicalizerException;
import org.bouncycastle.util.encoders.Base64;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
public class IRMarkDOS
{
/**
* @param args
*/
public static void main(String[] args)
{
// Initialise Apache XML tools
Init.init();
// Start tracking execution time
long start = System.currentTimeMillis();
try
{
// Validate/parse the command line
if (args.length != 3)
{
System.out.println("INCORRECT PARAMETERS SPECIFIED" + System.getProperty("line.separator") + " Specify IRMark.exe <InputFile> <OutputFile> " + "<TaxNamespace>");
return;
}
// Set input/output variables
String sInput = args[0];
String sOutput = args[1];
String sTaxNamespace = args[2];
// Read the XML Document
//Document xmlDoc = IRMarkDOS.processXML(sInput, sTaxNamespace);
String xml = IRMarkDOS.processXML(sInput, sTaxNamespace);
System.gc();
// Generate the IRMark
String strIRMark = IRMarkDOS.generateIRMark(xml);
// Write to file
PrintWriter out = new PrintWriter(new FileOutputStream(sOutput));
out.println(strIRMark);
out.close();
System.out.println("IRmark64: " + strIRMark);
// Output execution time
long end = System.currentTimeMillis();
System.out.println("Execution Time " + ((end-start) / 1000) + " seconds, " + (end-start) + " milliseconds");
}
catch (RuntimeException ex)
{
System.out.println(ex.getMessage());
System.exit(1);
}
catch (OutOfMemoryError ex)
{
System.out.println(ex.getMessage());
System.exit(1);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
*
* Processes the specified XML document
*
* @param sInput - XML Document
* @param sTaxNamespace - TaxNamespace
* @param sEnvelopeNamespace - EnvelopeNamespace
* @return
*/
private static String processXML (String sInput, String sTaxNamespace)
{
try
{
// Read XML
File xmlDocument=new File(sInput);
DocumentBuilderFactory xmlDomFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder xmlBuilder = xmlDomFactory.newDocumentBuilder();
Document xmlDoc = xmlBuilder.parse(xmlDocument);
// Setup XPath
XPathFactory factory=XPathFactory.newInstance();
XPath xPath=factory.newXPath();
Node body = (Node) xPath.evaluate("/GovTalkMessage/Body", xmlDoc, XPathConstants.NODE);
//Get IRMark and remove if exists
Node irmark = (Node) xPath.evaluate("/GovTalkMessage/Body/IRenvelope/IRheader/IRmark", xmlDoc, XPathConstants.NODE);
if (irmark != null)
{
System.out.println("Original IRMark: " + irmark.getTextContent());
irmark.getParentNode().removeChild(irmark);
}
irmark = null;
// Create new doc for body and add envelope namespace to body as required
xmlDoc = null;
xmlDoc = xmlBuilder.newDocument();
Node tmp = xmlDoc.importNode(body, true);
xmlDoc.appendChild(tmp);
tmp = null;
// Add namespace to body element
xmlDoc.getDocumentElement().setAttributeNS("http://www.w3.org/2000/xmlns/", "xmlns", "http://www.govtalk.gov.uk/CM/envelope");
return IRMarkDOS.getOuterXml(xmlDoc);
}
catch (RuntimeException ex)
{
System.out.println(ex.getMessage());
System.exit(1);
} catch (ParserConfigurationException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (XPathExpressionException e) {
e.printStackTrace();
}
return null;
}
/**
* Generates the IRMark for the specified XML Document
*
* @param xmlDoc - XML Document to generate the IRMark for
* @return - The generated IRMark
*/
private static String generateIRMark(String bodyText1)
{
// Get XML string
//String bodyText1 = IRMarkDOS.getOuterXml(xmlDoc);
// Final Data Tweaks
bodyText1 = bodyText1.toString();
bodyText1 = bodyText1.replace("
", "");
bodyText1 = bodyText1.replace("\r\n", "\n");
bodyText1 = bodyText1.replace("\r", "\n");
try
{
// Convert the final document back into a byte array encoded as UTF8
byte[] bodyBytes = bodyText1.getBytes("UTF8");
// Canonicalisation to C14n
Canonicalizer c14n = Canonicalizer.getInstance("http://www.w3.org/TR/2001/REC-xml-c14n-20010315");
byte[] bodyCanonical = c14n.canonicalize(bodyBytes);
// Generate SHA 1 and convert to Base64
MessageDigest md1 = MessageDigest.getInstance("SHA");
md1.update(bodyCanonical); //bodyBytes
byte[] digest1 = md1.digest();
String strIRmark = new String(Base64.encode(digest1));
return strIRmark;
}
catch (RuntimeException ex)
{
System.out.println(ex.getMessage());
System.exit(1);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (InvalidCanonicalizerException e) {
e.printStackTrace();
} catch (CanonicalizationException e) {
e.printStackTrace();
} catch (ParserConfigurationException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
return "";
}
/**
* Converts a XML Node to a string representation
*
* @param node - XML Node to convert to String
* @return - A string representation of the XML Node
*/
private static String getOuterXml(Node node)
{
try
{
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty("omit-xml-declaration", "yes");
StringWriter writer = new StringWriter();
transformer.transform(new DOMSource(node), new StreamResult(writer));
return writer.toString();
}
catch (Exception e)
{
e.printStackTrace();
}
return "";
}
}
Upvotes: 2
Views: 4525
Reputation: 21449
DOM XML trees are fine when dealing with small files, because when you load an XML file using the DOM API, a tree holding the whole document structure is created in memory. When you deal with huge amounts of XML data, you should use XML streaming instead. XML streaming uses minimal amounts of memory and gives you the flexibility to read and process your XML data in a simple, straightforward way.
Upvotes: 0
Reputation: 6783
I would also suggest the following modification for your processXML() method:
// Illustrative skeleton only: the parsing logic is left as a placeholder and
// no return statement is shown, so this does not compile as written.
// It shows the Document reference being cleared, and finalization and a
// garbage collection being requested, in a finally block.
private static String processXML (String sInput, String sTaxNamespace){
// Declared outside the try so that the finally block can see it.
Document xmlDoc = null;
try{
/**
* Your processing logic
*/
}catch(Exception exe){
//Catch your individual exceptions
}finally{
// Runs whether or not the processing above threw.
if(xmlDoc != null){
// Drop this method's reference to the document.
// NOTE(review): xmlDoc is a local variable and goes out of scope when the
// method returns anyway; other references to the tree (for example nodes
// taken from it) would still keep it reachable - confirm against the real logic.
xmlDoc = null;
// Both calls below are requests to the JVM, not guarantees.
System.runFinalization();
System.gc();
}
}
}
This way, you make your Document eligible for garbage collection once you have finished processing it.
Upvotes: 0
Reputation: 309008
You could try just using a SAX parser and responding to specific element opening and closing events. Perhaps you can accomplish what you need to do without holding the entire DOM in memory at once that way.
Upvotes: 3