RalfB
RalfB

Reputation: 583

Java: SAX Parsing a huge XML file

I have a 35 GB XML file (yes, some organizations do that and I have no control over it) that I would like to SAX parse. I found an example here:

http://www.java2s.com/Code/Java/XML/SAXDemo.htm

of how to run a SAX parser and avoid loading everything. However, I get an out of memory error immediatly. Why does this happens and how I can make this code perfectly scalable for any XML file size?

Here my code:

import org.apache.log4j.Logger;
import org.xml.sax.AttributeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

public class XMLSAXTools extends org.xml.sax.helpers.DefaultHandler {

/**
 * Logging facility
 */
static Logger logger = Logger.getLogger(XMLSAXTools.class);

private String fileName = "C:/Data/hugefile.xml";
private int counter = 0;

/** The main method sets things up for parsing */
public void test() throws IOException, SAXException,
        ParserConfigurationException {
    // Create a JAXP "parser factory" for creating SAX parsers
    javax.xml.parsers.SAXParserFactory spf = SAXParserFactory.newInstance();

    // Configure the parser factory for the type of parsers we require
    spf.setValidating(false); // No validation required

    // Now use the parser factory to create a SAXParser object
    // Note that SAXParser is a JAXP class, not a SAX class
    javax.xml.parsers.SAXParser sp = spf.newSAXParser();

    // Create a SAX input source for the file argument
    org.xml.sax.InputSource input = new InputSource(new FileReader(fileName));

    // Give the InputSource an absolute URL for the file, so that
    // it can resolve relative URLs in a <!DOCTYPE> declaration, e.g.
    input.setSystemId("file://" + new File(fileName).getAbsolutePath());

    // Create an instance of this class; it defines all the handler methods
    XMLSAXTools handler = new XMLSAXTools();

    // Finally, tell the parser to parse the input and notify the handler
    sp.parse(input, handler);

    // Instead of using the SAXParser.parse() method, which is part of the
    // JAXP API, we could also use the SAX1 API directly. Note the
    // difference between the JAXP class javax.xml.parsers.SAXParser and
    // the SAX1 class org.xml.sax.Parser
    //
    // org.xml.sax.Parser parser = sp.getParser(); // Get the SAX parser
    // parser.setDocumentHandler(handler); // Set main handler
    // parser.setErrorHandler(handler); // Set error handler
    // parser.parse(input); // Parse!
}

StringBuffer accumulator = new StringBuffer(); // Accumulate parsed text

String servletName; // The name of the servlet

String servletClass; // The class name of the servlet

String servletId; // Value of id attribute of <servlet> tag

// When the parser encounters plain text (not XML elements), it calls
// this method, which accumulates them in a string buffer
public void characters(char[] buffer, int start, int length) {
    accumulator.append(buffer, start, length);
}

// Every time the parser encounters the beginning of a new element, it
// calls this method, which resets the string buffer
public void startElement(String name, AttributeList attributes) {
    accumulator.setLength(0); // Ready to accumulate new text
    if (name.equals("item")) {
        logger.info("item tag opened");
        counter++;
    }
}

// When the parser encounters the end of an element, it calls this method
public void endElement(String name) {
    if (name.equals("item")) {
        logger.info("item tag closed. Counter: " + counter);
    }
}

/** This method is called when warnings occur */
public void warning(SAXParseException exception) {
    System.err.println("WARNING: line " + exception.getLineNumber() + ": "
            + exception.getMessage());
}

/** This method is called when errors occur */
public void error(SAXParseException exception) {
    System.err.println("ERROR: line " + exception.getLineNumber() + ": "
            + exception.getMessage());
}

/** This method is called when non-recoverable errors occur. */
public void fatalError(SAXParseException exception) throws SAXException {
    System.err.println("FATAL: line " + exception.getLineNumber() + ": "
            + exception.getMessage());
    throw (exception);
}

public static void main(String[] args){
    XMLSAXTools t = new XMLSAXTools();

    try {
        t.test();
    } catch (Exception e){
        logger.error("Exception in XMLSAXTools: " + e.getMessage());
        e.printStackTrace();
    }

}

}

Upvotes: 2

Views: 4848

Answers (1)

OldCurmudgeon
OldCurmudgeon

Reputation: 65879

You are filling up your accumulator without ever emptying it - this is unlikely to be what you want.

Just using SAX is not sufficient to ensure you do not run out of memory - you still need to implement the code that finds, selects and processes what you do need from the xml and discards the rest.

Here's a fairly simple parser that is designed to be run in a separate thread. It communicates with the calling thread via n ArrayBlockingQueue<String> queue which is defined in an enclosing class.

The huge data files I have to deal with are essentially <Batch> ... a few thousand items ... </Batch>. This parser pulls each item out and presents them one-at-a-time through the blocking queue. One day I will turn them into XOM Elements but atm it uses Strings.

Notice how it clears down its temporary data fields when enque is called to ensure we don't run out of memory:

    private class Parser extends DefaultHandler {
      // Track the depth of the xml - whenever we hit level 1 we add the accumulated xml to the queue.
      private int level = 0;
      // The current xml fragment.
      private final StringBuilder xml = new StringBuilder();
      // We've had a start tag but no data yet.
      private boolean tagWithNoData = false;

      /*
       * Called when the starting of the Element is reached. For Example if we have Tag
       * called <Title> ... </Title>, then this method is called when <Title> tag is
       * Encountered while parsing the Current XML File. The AttributeList Parameter has
       * the list of all Attributes declared for the Current Element in the XML File.
       */
      @Override
      public void startElement(final String uri, final String localName, final String name, final Attributes atrbts) throws SAXException {
        checkForAbort();
        // Have we got back to level 1 yet?
        if (level == 1) {

          // Emit any built ones.
          try {
            enqueue();
          } catch (InterruptedException ex) {
            Throwables.rethrow(ex);
          }
        }

        // Add it on.
        if (level > 0) {
          // The name.
          xml.append("<").append(name);
          // The attributes.
          for (int i = 0; i < atrbts.getLength(); i++) {
            final String att = atrbts.getValue(i);

            xml.append(" ").append(atrbts.getQName(i)).append("=\"").append(XML.to(att)).append("\"");
          }
          // Done.
          xml.append(">");

          // Remember we've not had any data yet.
          tagWithNoData = true;
        }

        // Next element is a sub-element.
        level += 1;
      }

      /*
       * Called when the Ending of the current Element is reached. For example in the
       * above explanation, this method is called when </Title> tag is reached
       */
      @Override
      public void endElement(final String uri, final String localName, final String name) throws SAXException {
        checkForAbort();

        if (level > 1) {
          if (tagWithNoData) {

            // No data. Make the > into a />
            xml.insert(xml.length() - 1, "/");
            // I've closed this one but the enclosing one has data (i.e. this one).
            tagWithNoData = false;
          } else {

            // Had data, finish properly.
            xml.append("</").append(name).append(">");
          }
        }

        // Done with that level.
        level -= 1;

        if (level == 1) {
          // Finished and at level 1.
          try {

            // Enqueue the results.
            enqueue();
          } catch (InterruptedException ex) {
            Throwables.rethrow(ex);
          }
        }
      }

      /*
       * Called when the data part is encountered.
       */
      @Override
      public void characters(final char buf[], final int offset, final int len) throws SAXException {
        checkForAbort();

        // I want it trimmed.
        final String chs = new String(buf, offset, len).trim();

        if (chs.length() > 0) {
          // Grab that data.
          xml.append(XML.to(chs));
          tagWithNoData = false;
        }
      }

      /*
       * Called when the Parser starts parsing the Current XML File.
       */
      @Override
      public void startDocument() throws SAXException {

        checkForAbort();
        tagWithNoData = false;
      }

      /*
       * Called when the Parser Completes parsing the Current XML File.
       */
      @Override
      public void endDocument() throws SAXException {

        checkForAbort();

        try {

          // Enqueue the results.
          enqueue();
        } catch (InterruptedException ex) {
          Throwables.rethrow(ex);
        }
      }

      private void enqueue() throws InterruptedException, SAXException {
        // We may have been closed while blocking on the queue.
        checkForAbort();
        final String x = xml.toString().trim();

        if (x.length() > 0) {
          // Add it to the queue.
          queue.put(x);

          // Clear out.
          xml.setLength(0);
          tagWithNoData = false;

        }
        // We may have been closed while blocking on the queue.
        checkForAbort();
      }

      private void checkForAbort() throws XMLInnerDocumentIteratorAbortedException {
        if (iteratorFinished) {
          LOGGER.debug("Aborting!!!");

          throw new XMLInnerDocumentIterator.XMLInnerDocumentIteratorAbortedException("Aborted!");
        }
      }
    }
  }

Upvotes: 8

Related Questions