How to create a html tree?

Question

I need to dig a bit in some html files and I wanted to first transform them into the readible form of a tree one tag at a line. Nevertheless I have no experience in html. Could someone correnct my code and point out the rules I've forgotten?

My code does not work for for real life pages. At the end of the program execution the nesting counter should be set to 0, as program should leave all the nested tags it has met. It does not. For a facebook page it is more than 2000 tags remaining open.

Before one would suggest me using a library, I haven't seen any good one out there. For my pages transforming into xml somehow fails and htmlcxx library has no proper documentation.

#include 

char get_char( FILE *stream ) {
    char c;
    do
        c = getc(stream);
    while ( c == ' ' || c == '
' || c == '	' || c == '
' );
    return c;
}

void fun( FILE *stream, FILE *out ) {   
    int counter = -1;
    char c;

    do {
        c = get_char(stream);
        if ( c == EOF )
            break;

        if ( c != '<' ) { // print text
            for ( int i = counter + 1; i; --i )
                putc( ' ', out );
            fprintf( out, "TEXT: " );
            do {
                if ( c == '
' )
                    fprintf( out, "
" ); // random separator
                else
                    putc( c, out );
                c = getc( stream );
            } while ( c != '<' );
            putc( '
', out );
        }

        c = getc( stream );
        if ( c != '/' ) { // nest deeper
            ++counter;
            for ( int i = counter; i; --i )
                putc( ' ', out );
        } else { // go back in nesting
            --counter;
            // maybe here should be some exception handling
            do // assuming there's no strings in quotation marks here
                c = getc( stream );
            while ( c != '>' );
            continue;
        }

        ungetc( c, stream );
        do { // reading tag
            c = getc(stream);
            if( c == '/' ) { // checking if it's not a 
                c = getc(stream);
                if ( c == '>' ) {
                    --counter;
                    break;
                }
                putc( '/', out );
                putc( c, out );
            } else if ( c == '"' ) { // not parsing strings put in quotation marks
                do {
                    putc( c, out ); c = getc( stream );
                    if ( c == '\' ) {
                        putc( c, out ); c = getc( stream );
                        if ( c == '"' ) {
                            putc( c, out ); c = getc( stream );
                        }
                    }
                } while ( c != '"' );
                putc( c, out );
            } else if ( c == '>' ) { // end of tag
                break;
            } else // standard procedure
                putc( c, out );
        } while ( true );
        putc( '
', out );
    } while (true);
    fprintf( out, "Counter: %d", counter );
}

int main() {
    const char *name = "rfb.html";
    const char *oname = "out.txt";
    FILE *file = fopen(name, "r");
    FILE *out = fopen(oname, "w");
    fun( file, out );
    return 0;
}

kassak · Accepted Answer

HTML != XML Tags could be non-closed, for example is considered equal to

How to create a html tree?

Answers (2)

Related Questions