Alexx Perez
Alexx Perez

Reputation: 215

Parsing an XML feed truncates strings at escaped characters

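I want to parse an RSS feed, but it fails at some characters, for example the ">" character (escaped as &gt; in the feed): only the text before this character is returned and everything from it onward is lost.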

Example:

<title>[Maths I &gt; Theory] Maths I, T1.pdf: One file added.</title>

Output:

[Maths I 

This is my RSSHandler:

/**
 * SAX handler that fills an RSSFeed from the title, description, link and
 * pubdate elements of a document, creating one RSSItem per "item" element.
 */
public class RSSHandler extends DefaultHandler {

// Parser states: which field the text reported by characters() is stored in.
final int state_unknown = 0;
final int state_title = 1;
final int state_description = 2;
final int state_link = 3;
final int state_pubdate = 4;
int currentState = state_unknown;

// Feed being built and the item currently being filled.
RSSFeed feed;
RSSItem item;

// Becomes true at the first "item" start tag; it is never set back to false.
boolean itemFound = false;

RSSHandler(){
}

/** Returns the feed object created in startDocument(). */
RSSFeed getFeed(){
return feed;
}

/** Creates a fresh feed and a placeholder item at the start of each document. */
@Override
public void startDocument() throws SAXException {
feed = new RSSFeed();
item = new RSSItem();

}

/** Nothing to do at the end of the document. */
@Override
public void endDocument() throws SAXException {
}

/**
 * Selects the parser state from the element name. An "item" start tag also
 * begins a new RSSItem; any unrecognised element selects state_unknown.
 */
@Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {

// NOTE(review): only localName is inspected; qName is ignored.
if (localName.equalsIgnoreCase("item")){
itemFound = true;
item = new RSSItem();
currentState = state_unknown;
}
else if (localName.equalsIgnoreCase("title")){
currentState = state_title;
}
else if (localName.equalsIgnoreCase("description")){
currentState = state_description;
}
else if (localName.equalsIgnoreCase("link")){
currentState = state_link;
}
else if (localName.equalsIgnoreCase("pubdate")){
currentState = state_pubdate;
}
else{
currentState = state_unknown;
}

}

/** Adds the current item to the feed when an "item" element closes. */
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
if (localName.equalsIgnoreCase("item")){
feed.addItem(item);
}
}

/**
 * Stores the reported text in the field selected by currentState: on the
 * current item once an "item" element has been seen, otherwise on the feed.
 *
 * NOTE(review): a SAX parser may report the text of one element in several
 * characters() callbacks (for example around an entity such as &gt;). Each
 * setter below receives only the chunk passed to this call, and the state
 * reset at the end of the method means the remaining chunks of the same
 * element are ignored, so only the first chunk is kept.
 */
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {

// Copy only the slice of the parser's buffer that belongs to this callback.
String strCharacters = new String(ch,start,length);

if (itemFound==true){
// "item" tag found, it's item's parameter
switch(currentState){
case state_title:
 item.setTitle(strCharacters);
 break;
case state_description:
 item.setDescription(strCharacters);
 break;
case state_link:
 item.setLink(strCharacters);
 break;
case state_pubdate:
 item.setPubdate(strCharacters);
 break;
default:
 break;
}
}
else{
// not "item" tag found, it's feed's parameter
switch(currentState){
case state_title:
 feed.setTitle(strCharacters);
 break;
case state_description:
 feed.setDescription(strCharacters);
 break;
case state_link:
 feed.setLink(strCharacters);
 break;
case state_pubdate:
 feed.setPubdate(strCharacters);
 break;
default:
 break;
}
}

// Resetting here drops every chunk after the first one (see the note above).
currentState = state_unknown;
}


}

Upvotes: 1

Views: 208

Answers (2)

palacsint
palacsint

Reputation: 28885

Here is a slightly modified version which can parse RSS files well. I hope it helps.

First, a State enum:

/**
 * Identifies which RSS element the handler is currently reading.
 */
public enum State {

    /** Any element whose text the handler does not store. */
    unknown,

    /** A "title" element. */
    title,

    /** A "description" element. */
    description,

    /** A "link" element. */
    link,

    /** A "pubdate" element. */
    pubdate

}

Then the handler class:

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that builds an {@code RSSFeed} from an RSS document.
 *
 * <p>Text is buffered across {@code characters()} callbacks and stored only when
 * the element closes, because a parser may report the text of a single element
 * in several chunks (for example around an entity such as {@code &gt;}).
 */
public class RSSHandler extends DefaultHandler {

    /** Field that the element currently being read maps to. */
    private State currentState = State.unknown;

    private RSSFeed feed;
    private RSSItem item;

    /** True while the parser is inside an "item" element. */
    private boolean itemFound = false;

    /** Text collected so far for the element currently being read. */
    private final StringBuilder tagContent = new StringBuilder();

    public RSSHandler() {
    }

    /** Resets all parsing state so the handler starts each document clean. */
    @Override
    public void startDocument() throws SAXException {
        feed = new RSSFeed();
        item = new RSSItem();
        itemFound = false;
        currentState = State.unknown;
        tagContent.setLength(0);
    }

    /**
     * Starts buffering text for a new element and selects the field it maps to.
     * An "item" start tag also begins a new {@code RSSItem}.
     */
    @Override
    public void startElement(final String uri, final String localName,
            final String qName, final Attributes attributes)
            throws SAXException {
        final String name = elementName(localName, qName);
        tagContent.setLength(0);
        if (name.equalsIgnoreCase("item")) {
            itemFound = true;
            item = new RSSItem();
            currentState = State.unknown;
        } else {
            currentState = stateFor(name);
        }
    }

    /**
     * Stores the buffered text in the field selected by the matching start tag,
     * and adds the current item to the feed when an "item" element closes.
     */
    @Override
    public void endElement(final String uri, final String localName,
            final String qName) throws SAXException {
        final String text = tagContent.toString();
        if (itemFound) {
            storeInItem(text);
        } else {
            storeInFeed(text);
        }
        if (elementName(localName, qName).equalsIgnoreCase("item")) {
            feed.addItem(item);
            // Elements that follow the item belong to the feed again.
            itemFound = false;
        }
        // Reset so that the closing tag of a parent element does not store the
        // last child's field a second time with trailing whitespace appended.
        currentState = State.unknown;
        tagContent.setLength(0);
    }

    /** Appends one chunk of text to the buffer of the current element. */
    @Override
    public void characters(final char[] ch, final int start, final int length)
            throws SAXException {
        tagContent.append(ch, start, length);
    }

    /** Returns the feed built by the most recent parse. */
    public RSSFeed getFeed() {
        return feed;
    }

    /** Returns the local name, or the qualified name when the local name is empty. */
    private static String elementName(final String localName, final String qName) {
        if (localName != null && localName.length() > 0) {
            return localName;
        }
        return qName == null ? "" : qName;
    }

    /** Maps an element name to the state whose field receives its text. */
    private static State stateFor(final String name) {
        if (name.equalsIgnoreCase("title")) {
            return State.title;
        } else if (name.equalsIgnoreCase("description")) {
            return State.description;
        } else if (name.equalsIgnoreCase("link")) {
            return State.link;
        } else if (name.equalsIgnoreCase("pubdate")) {
            return State.pubdate;
        }
        return State.unknown;
    }

    /** Writes the text to the field of the current item selected by the state. */
    private void storeInItem(final String text) {
        switch (currentState) {
            case title:
                item.setTitle(text);
                break;
            case description:
                item.setDescription(text);
                break;
            case link:
                item.setLink(text);
                break;
            case pubdate:
                item.setPubdate(text);
                break;
            default:
                break;
        }
    }

    /** Writes the text to the field of the feed selected by the state. */
    private void storeInFeed(final String text) {
        switch (currentState) {
            case title:
                feed.setTitle(text);
                break;
            case description:
                feed.setDescription(text);
                break;
            case link:
                feed.setLink(text);
                break;
            case pubdate:
                feed.setPubdate(text);
                break;
            default:
                break;
        }
    }

}

Upvotes: 1

palacsint
palacsint

Reputation: 28885

You get at least 5 callbacks to the characters method:

1st: [Maths I 
2nd: >
3rd: Theory
4th: ]
5th: Maths I, T1.pdf: One file added.

You shouldn't change the currentState in the last line of the characters method. Instead, you have to buffer the strings from each callback and concatenate them later, once the element ends.

Upvotes: 1

Related Questions