Parsing an xml file correctly in perl

Question

I have a gzipped XML results file that I want to output as a nice tab-separated table.

I came up with a script that should do that job (code see below).

BUT: When some of the sections in the file, e.g. Hamap, are not present, the table is shifted and no hyphen is printed where the field should be empty. How do I have to modify the script so that absent fields are also output as empty?

I can post two examples of my xml file in case you wish!

use warnings;
use strict;
use IO::Compress::Gzip;

# Script setup: derive the file names, declare the parse state and open the
# gzipped input for reading through a `gzip -dc` pipe.
@ARGV or die "Usage: $0 <results.xml.gz>\n";

my $input_file = $ARGV[0];

# Output name: input name with a trailing ".gz" removed and ".tsv" appended.
# The pattern is anchored so that only the suffix is stripped, never a ".gz"
# that happens to occur earlier in the path.
(my $output_base = $input_file) =~ s/\.gz\z//;
my $output_file = "$output_base.tsv";

my $i = 0;          # running counter, stored per query to keep input order
my $query;          # identifier of the record currently being filled
my %annotation;     # query => { field name => collected text }

# List-form pipe open: the file name is passed to gzip as a single argument
# and never goes through a shell, so spaces or shell metacharacters in the
# name are harmless. The bareword handle IN is kept because the code further
# down (e.g. the close) refers to it by that name.
open(IN, '-|', 'gzip', '-dc', '--', $input_file)
    or die "\nCould not open $input_file: $!\n";


while () {
    if ( $_ =~ /\/) {

        $query = $3;
        $annotation{$query}{"order"}            = $i;
        $annotation{$query}{"Pfam"}             = "-";
        $annotation{$query}{"TIGRFAM"}          = "-";
        $annotation{$query}{"Gene3D"}           = "-";
        $annotation{$query}{"PANTHER"}          = "-";
        $annotation{$query}{"ProSiteProfiles"}  = "-";
        $annotation{$query}{"Hamap"}            = "-";
        $annotation{$query}{"SUPERFAMILY"}      = "-";
        $annotation{$query}{"PRINTS"}           = "-";
        $annotation{$query}{"PIRSF"}            = "-";
        $annotation{$query}{"SMART"}            = "-";
        $annotation{$query}{"GO_BIO"}           = "-";
        $annotation{$query}{"GO_MOL"}           = "-";
        $annotation{$query}{"GO_CEL"}           = "-";
        $annotation{$query}{"IPRO"}             = "-";
        $annotation{$query}{"pathway"}          = "-";
        $annotation{$query}{"CDD"}              = "-";
        $annotation{$query}{"MobiDBLite"}       = "-";


        $i++;


    } elsif ($_ =~ /\/ ) {                # Pfam 
            if ($annotation{$query}{"Pfam"} eq "-"){
                $annotation{$query}{"Pfam"} = "$1: $2";
        }else{
                $annotation{$query}{"Pfam"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {              # TIGRFAM
        if ($annotation{$query}{"TIGRFAM"} eq "-"){
                $annotation{$query}{"TIGRFAM"} = "$1: $2";
        }else{
                $annotation{$query}{"TIGRFAM"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {                                        # Gene3D
        if ($annotation{$query}{"Gene3D"} eq "-"){
                $annotation{$query}{"Gene3D"} = "$1";
            }else{
            $annotation{$query}{"Gene3D"} .= "; $1";
            } 

    } elsif ( $_ =~ /\/ ) {                           # PANTHER
        if ($annotation{$query}{"PANTHER"} eq "-"){
                $annotation{$query}{"PANTHER"} = "$1: $2";
        }else{
                $annotation{$query}{"PANTHER"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {                              # ProSiteProfiles
        if ($annotation{$query}{"ProSiteProfiles"} eq "-"){
                $annotation{$query}{"ProSiteProfiles"} = "$1: $2";
        }else{
                $annotation{$query}{"ProSiteProfiles"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {                # Hamap
        if ($annotation{$query}{"Hamap"} eq "-"){
                $annotation{$query}{"Hamap"} = "$1: $2";
        }else{
                $annotation{$query}{"Hamap"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {                             # SUPERFAMILY
        if ($annotation{$query}{"SUPERFAMILY"} eq "-"){
                $annotation{$query}{"SUPERFAMILY"} = "$1: $2";
            }else{
                $annotation{$query}{"SUPERFAMILY"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {               # PRINTS
        if ($annotation{$query}{"PRINTS"} eq "-"){
                $annotation{$query}{"PRINTS"} = "$1: $2";
        }else{
                $annotation{$query}{"PRINTS"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {                           # PIRSF
        if ($annotation{$query}{"PIRSF"} eq "-"){
                $annotation{$query}{"PIRSF"} = "$1: $2";
        }else{
                $annotation{$query}{"PIRSF"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {               # SMART
        if ($annotation{$query}{"SMART"} eq "-"){
                $annotation{$query}{"SMART"} = "$1: $2";
        }else{
                $annotation{$query}{"SMART"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\/ ) {               # CDD
        if ($annotation{$query}{"CDD"} eq "-"){
                $annotation{$query}{"CDD"} = "$1: $2";
        }else{
                $annotation{$query}{"CDD"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\/ ) {      # MobiDBLite
        if ($annotation{$query}{"MobiDBLite"} eq "-"){
                $annotation{$query}{"MobiDBLite"} = "$1: $2";
        }else{
                $annotation{$query}{"MobiDBLite"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\/ ) {
        if ($annotation{$query}{"GO_BIO"} eq "-"){
                $annotation{$query}{"GO_BIO"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_BIO"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {
        if ($annotation{$query}{"GO_MOL"} eq "-"){
                $annotation{$query}{"GO_MOL"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_MOL"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {
        if ($annotation{$query}{"GO_CEL"} eq "-"){
                $annotation{$query}{"GO_CEL"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_CEL"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {
        if ($annotation{$query}{"pathway"} eq "-"){
                $annotation{$query}{"pathway"} = "$1-$2: $3";
        } elsif ($annotation{$query}{"pathway"} =~ /$2/) {
            next;
        }else{
                $annotation{$query}{"pathway"} .= "; $1-$2: $3";
            } 



    } elsif ( $_ =~ /\/ ) {
        if ($annotation{$query}{"IPRO"} eq "-"){
                $annotation{$query}{"IPRO"} = "$1: $2";
        } elsif ($annotation{$query}{"IPRO"} =~ /$1/) {
            next;
        }else{
                $annotation{$query}{"IPRO"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\/ ) {

    }

}
close (IN);

# Report writer: dump %annotation as a tab-separated table, one row per query,
# in the order the queries were first seen (their stored "order" value).

# Column layout: [ key in %annotation, header text ].
# Header and data rows are both generated from this single list, so the two
# can never get out of step with each other.
my @report_columns = (
    [ IPRO            => 'InterPro Entry' ],
    [ pathway         => 'InterPro: pathways' ],
    [ GO_MOL          => 'InterPro: GO terms - Molecular Function' ],
    [ GO_BIO          => 'InterPro: GO terms - Biological Process' ],
    [ GO_CEL          => 'InterPro: GO terms - Cellular Component' ],
    [ Pfam            => 'InterPro: Pfam' ],
    [ TIGRFAM         => 'InterPro: TIGRFAM' ],
    [ PANTHER         => 'InterPro: PANTHER' ],
    [ ProSiteProfiles => 'InterPro: ProSiteProfiles' ],
    [ Hamap           => 'InterPro: Hamap' ],
    [ PIRSF           => 'InterPro: PIRSF' ],
    [ Gene3D          => 'InterPro: Gene3D' ],
    [ SUPERFAMILY     => 'InterPro: SUPERFAMILY' ],
    [ PRINTS          => 'InterPro: PRINTS' ],
    [ SMART           => 'InterPro: SMART' ],
    [ CDD             => 'InterPro: CDD' ],
    [ MobiDBLite      => 'InterPro: MobiDBLite' ],
);

# Three-argument open with a lexical handle: the file name is never parsed
# for a mode prefix, and the handle is scoped to this section.
open(my $out, '>', $output_file)
    or die "\nCould not open $output_file: $!\n";

print {$out} join("\t", 'Query', map { $_->[1] } @report_columns), "\n";

foreach my $query (sort { $annotation{$a}{"order"} <=> $annotation{$b}{"order"} } keys %annotation) {
    my $record = $annotation{$query};

    # A field that was never filled still yields a "-" placeholder, so a
    # missing section cannot shift the remaining columns.
    print {$out} join("\t", $query, map { $record->{ $_->[0] } // '-' } @report_columns), "\n";
}

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($out)
    or die "\nCould not write $output_file: $!\n";

# Reaching this point means success; a non-zero status would signal failure
# to the calling shell.
exit 0;

UPDATE: Here are two example files, the first one works as expected, but the second does not work properly (there is a shift in the fields).

example 1

example 2

Desired output of example 2:

Query   InterPro Entry  InterPro: pathways  InterPro: GO terms - Molecular Function InterPro: GO terms - Biological Process InterPro: GO terms - Cellular Component InterPro: Pfam  InterPro: TIGRFAM   InterPro: PANTHER   InterPro: ProSiteProfiles   InterPro: Hamap InterPro: PIRSF InterPro: Gene3D    InterPro: SUPERFAMILY   InterPro: PRINTS    InterPro: SMART InterPro: CDD   InterPro: MobiDBLite
id_10005" name="id_10005    -   -   -   -   -   PF13614: AAA domain -   PTHR13696:SF85: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED    -   G3DSA:3.40.50.300   SSF52540: P-loop containing nucleoside triphosphate hydrolases  -   -   -   -   -   mobidb-lite: consensus disorder prediction; mobidb-lite: consensus disorder prediction
id_10004" name="id_10004    -   -   -   -   -   PF13614: AAA domain -   PTHR13696:SF86: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED    -   G3DSA:3.40.50.300   SSF52540: P-loop containing nucleoside triphosphate hydrolases  -   -   -   -   cd02042: ParA   -

Stefan Becker · Accepted Answer

Your code is quite long, so my answer will only present the gist of the idea with a few example items processed. But it should give you a starting point how to process the other items in your XML data.

#!/usr/bin/perl
use strict;
use warnings;

use XML::LibXML;

# XML namespace
use constant DEFAULT_XMLNS => 'http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5';

my $doc;
eval {
    $doc = XML::LibXML->load_xml(IO => \*STDIN);
};
die "XML parser error: $@
"
    if $@;

# initialize XPath context
# NOTE: all nodes without NS must use default: prefix!
my $xpc = XML::LibXML::XPathContext->new();
$xpc->registerNs('default', DEFAULT_XMLNS);

# Signature processors - code
# Signature processor that records only the primary identifier.
#   $_[0]  hash reference of the protein record being filled
#   $_[1]  column key inside that record
#   $_[2]  primary value to append to the column's list
# The column's array is created on first use.
sub processor_just_primary {
    my $record = shift;
    my ($column, $value) = @_;
    push @{ $record->{$column} }, $value;
}
# Signature processor that records "<primary>: <first attribute value>".
#   $_[0]  hash reference of the protein record being filled
#   $_[1]  column key inside that record
#   $_[2]  primary value
#   $_[3]  array reference of attribute values; only element 0 is used
# The column's array is created on first use.
sub processor_primary_and_first_attr {
    my ($record, $column, $value, $extras) = @_;
    my $label = $extras->[0];
    push @{ $record->{$column} }, "${value}: " . $label;
}

# Signature processors - map key, "ac" identifier, attributes, code
#
# One entry per output column. For every entry:
#   id   - pattern tried against a signature's "ac" attribute; when it matches,
#          its first capture group becomes the "primary" value
#   attr - names of further attributes read from the signature node, in order
#   code - handler invoked as code->(\%protein, $key, $primary, \@attrs),
#          which appends the formatted text to the protein's column
my %signature_processors = (
    Gene3D  => {
        id   => qr/^G3DSA:(.+)/,    # capture excludes the "G3DSA:" prefix
        attr => [],                 # no extra attributes needed
        code => \&processor_just_primary,
    },
    Hamap   => {
        id   => qr/^(MF.+)/,        # whole "ac" value is captured
        attr => [ qw{desc name} ],  # "desc" comes first, so it is the one the handler uses
        code => \&processor_primary_and_first_attr,
    },
    PANTHER => {
        id   => qr/^(PTHR.+)/,      # whole "ac" value is captured
        attr => [ qw{name} ],
        code => \&processor_primary_and_first_attr,
    },
    TIGRFAM => {
        id   => qr/^(TIGR.+)/,      # whole "ac" value is captured
        attr => [ qw{desc name} ],  # "desc" comes first, so it is the one the handler uses
        code => \&processor_primary_and_first_attr,
    },
);

# Build one record per <protein> element, in document order. Each record is a
# hash with an ID plus one key per entry of %signature_processors; a key whose
# value stays undefined means "no match found" and later becomes an empty column.
my @proteins;
foreach my $protein_node ($xpc->findnodes('//default:protein', $doc)) {
    # search <xref> node downwards from <protein> node
    my @xrefs  = $xpc->findnodes('./default:xref', $protein_node)
        or die "Can't find xref node for protein " . $protein_node->toString() . "\n";
    my $id     = $xrefs[0]->getAttribute('id')
        or die "Can't get attribute 'id' for protein " . $xrefs[0]->toString() . "\n";

    # initialize new protein
    # NOTE: a key with an undefined value means "not found" -> empty column
    my %protein  = map { ($_ => undef) } keys %signature_processors;
    $protein{ID} = $id;
    push(@proteins, \%protein);

    # fill protein with signature matches - searching <signature> nodes downwards from <protein> node
    foreach my $signature ($xpc->findnodes('./default:matches//default:signature', $protein_node)) {
        # Scalar assignment plus an explicit defined() test. The former
        # "my($x) = ... or die" was a list assignment, whose value in boolean
        # context is the number of elements assigned (always 1 here), so the
        # die could never fire even when the attribute was missing.
        my $attr_ac = $signature->getAttribute('ac');
        defined $attr_ac
            or die "Can't get attribute 'ac' for XML node " . $signature->toString() . "\n";

        # Every processor whose pattern matches gets to handle the signature.
        # Each key only appends to its own column, so the visiting order does
        # not influence the result; sorting merely makes the run deterministic.
        foreach my $key (sort keys %signature_processors) {
            my $processor = $signature_processors{$key};

            # a failed match returns the empty list -> count 0 -> skip
            my($primary)  = ($attr_ac =~ $processor->{id})
                or next;

            # additional attributes
            my @attrs;
            foreach my $attr (@{ $processor->{attr} }) {
                my $value = $signature->getAttribute($attr);
                defined $value
                    or die "Can't get attribute '${attr}' for XML node " . $signature->toString() . "\n";
                push(@attrs, $value);
            }

            # call processor
            $processor->{code}->(\%protein, $key, $primary, \@attrs);
        }
    }
}

my @key_order = qw(
    TIGRFAM
    Hamap
    PANTHER
    Gene3D
);

# Print one table row to STDOUT: all arguments joined by a TAB character and
# terminated by a newline.
# Declared without a prototype: "(@)" added nothing for a plain list of
# arguments, and prototypes change parsing rather than validate arguments.
sub dump_row {
    my @fields = @_;
    print join("\t", @fields), "\n";
}

dump_row('ID', @key_order);
foreach my $protein (@proteins) {
    my @columns =
        map { $_ ? join('; ', @{ $_ }) : '-' } # handle empty columns
        @{ $protein }{@key_order};

    dump_row($protein->{ID}, @columns);
}

exit 0;

Test output for example 1:

$ perl dummy.pl



Test output for example 2:

$ perl dummy.pl 




BONUS CODE: as the question is also tagged with csv I'll add the changes required to generate CSV output:

#!/usr/bin/perl
use strict;
use warnings;

use Text::CSV;
use XML::LibXML;

# ... the main code is left unchanged ...

my $csv = Text::CSV->new()
    or die "Cannot use CSV: " . Text::CSV->error_diag() . "
";
$csv->eol("
");

# Write one row to STDOUT through the Text::CSV object in $csv, which takes
# care of quoting and the line ending.
# Declared without a prototype: "(@)" added nothing for a plain list of
# arguments. A failed write is reported instead of being silently ignored.
sub dump_row {
    my @fields = @_;
    $csv->print(\*STDOUT, \@fields)
        or die "Cannot write CSV row: " . $csv->error_diag() . "\n";
}

dump_row('ID', @key_order);
foreach my $protein (@proteins) {
    my @columns =
        map { $_ ? join('; ', @{ $_ }) : '' } # handle empty columns
        @{ $protein }{@key_order};

    dump_row($protein->{ID}, @columns);
}

exit 0;




UPDATE 2: it turns out that loading the TSV output of the original code can lead to problems when your CSV importer doesn't have an option to disable , (comma) and ; (semicolon) as separators. The original code should therefore be rewritten to use Text::CSV instead, which does proper quoting and therefore avoids such problems.

I've also added some sanity check code for @key_order vs. %signature_processors.

# sanity checks: @key_order and %signature_processors must name exactly the
# same set of keys, otherwise a column would be requested that is never
# filled, or collected data would never be printed.
# The messages name the real variable (@key_order) and list the offending keys.
{
    my @unknown = grep { not exists $signature_processors{$_} } @key_order;
    die "\@key_order has keys not in \%signature_processors: @unknown\n"
        if @unknown;

    my %is_ordered = map { ($_ => 1) } @key_order;
    my @unordered  = grep { not exists $is_ordered{$_} } sort keys %signature_processors;
    die "\%signature_processors has keys not in \@key_order: @unordered\n"
        if @unordered;
}

my $csv = Text::CSV->new({
    binary   => 1,
    eol      => "
",
    # Select the output format by uncommenting *one* of the following
    #sep_char => ',', # CSV - comma separated values
    sep_char => "	", # TSV - TAB separated values
})
    or die "Cannot use CSV: " . Text::CSV->error_diag() . "
";

# Write one row to STDOUT through the Text::CSV object in $csv, which applies
# the configured separator, quoting and line ending.
# Declared without a prototype: "(@)" added nothing for a plain list of
# arguments. A failed write is reported instead of being silently ignored.
sub dump_row {
    my @fields = @_;
    $csv->print(\*STDOUT, \@fields)
        or die "Cannot write row: " . $csv->error_diag() . "\n";
}

Parsing an xml file correctly in perl

Answers (1)

Related Questions