rororo
rororo

Reputation: 845

Parsing an xml file correctly in perl

I have a xml results file (gzipped) that I want to output as a nice tab-separated table.

I came up with a script that should do that job (code see below).

BUT: When some of the sections in the file, e.g. Hamap, are not present, then the table is shifted and no hyphen is printed when the field should be empty. How do I have to modify the script so that fields that are absent are also outputted as empty?

I can post two examples of my xml file in case you wish!

use warnings;
use strict;
use IO::Compress::Gzip;

my $input_file  = $ARGV[0]; 
my $output_file = "$ARGV[0]".".tsv";
$output_file =~ s/\.gz//;
my $i = 0;
my $query; my %annotation;

open (IN,"gzip -dc $input_file |") or die "\nCould not open $input_file\n";


while (<IN>) {
    if ( $_ =~ /\<xref desc=\"(.*)\" db=\"(.*)\" id=\"(.*)\" name=\"(.*)\"\/\>/) {

        $query = $3;
        $annotation{$query}{"order"}            = $i;
        $annotation{$query}{"Pfam"}             = "-";
        $annotation{$query}{"TIGRFAM"}          = "-";
        $annotation{$query}{"Gene3D"}           = "-";
        $annotation{$query}{"PANTHER"}          = "-";
        $annotation{$query}{"ProSiteProfiles"}  = "-";
        $annotation{$query}{"Hamap"}            = "-";
        $annotation{$query}{"SUPERFAMILY"}      = "-";
        $annotation{$query}{"PRINTS"}           = "-";
        $annotation{$query}{"PIRSF"}            = "-";
        $annotation{$query}{"SMART"}            = "-";
        $annotation{$query}{"GO_BIO"}           = "-";
        $annotation{$query}{"GO_MOL"}           = "-";
        $annotation{$query}{"GO_CEL"}           = "-";
        $annotation{$query}{"IPRO"}             = "-";
        $annotation{$query}{"pathway"}          = "-";
        $annotation{$query}{"CDD"}              = "-";
        $annotation{$query}{"MobiDBLite"}       = "-";


        $i++;


    } elsif ($_ =~ /\<xref/){
                if ($_ =~ /id=\"(.*)\"/){
                 $query=$1;
                 $annotation{$query}{"order"}            = $i;
                $annotation{$query}{"Pfam"}              = "-";
                $annotation{$query}{"TIGRFAM"}           = "-";
                $annotation{$query}{"Gene3D"}            = "-";
                $annotation{$query}{"PANTHER"}           = "-";
                $annotation{$query}{"ProSiteProfiles"}   = "-";
                $annotation{$query}{"Hamap"}             = "-";
                $annotation{$query}{"SUPERFAMILY"}       = "-";
                $annotation{$query}{"PRINTS"}            = "-";
                $annotation{$query}{"PIRSF"}             = "-";
                $annotation{$query}{"SMART"}             = "-";
                $annotation{$query}{"GO_BIO"}            = "-";
                $annotation{$query}{"GO_MOL"}            = "-";
                $annotation{$query}{"GO_CEL"}            = "-";
                $annotation{$query}{"IPRO"}              = "-";
                $annotation{$query}{"pathway"}           = "-";
                $annotation{$query}{"CDD"}               = "-";
                $annotation{$query}{"MobiDBLite"}        = "-";


                $i++;

                }


    } elsif ( $_ =~ /\<signature ac=\"(PF.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) {                # Pfam 
            if ($annotation{$query}{"Pfam"} eq "-"){
                $annotation{$query}{"Pfam"} = "$1: $2";
        }else{
                $annotation{$query}{"Pfam"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(TIGR.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) {              # TIGRFAM
        if ($annotation{$query}{"TIGRFAM"} eq "-"){
                $annotation{$query}{"TIGRFAM"} = "$1: $2";
        }else{
                $annotation{$query}{"TIGRFAM"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(G3DSA:.*)\">/ ) {                                        # Gene3D
        if ($annotation{$query}{"Gene3D"} eq "-"){
                $annotation{$query}{"Gene3D"} = "$1";
            }else{
            $annotation{$query}{"Gene3D"} .= "; $1";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(PTHR.*)\" name=\"(.*)\"\>/ ) {                           # PANTHER
        if ($annotation{$query}{"PANTHER"} eq "-"){
                $annotation{$query}{"PANTHER"} = "$1: $2";
        }else{
                $annotation{$query}{"PANTHER"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(PS.*)\" desc=\"(.*)\">/ ) {                              # ProSiteProfiles
        if ($annotation{$query}{"ProSiteProfiles"} eq "-"){
                $annotation{$query}{"ProSiteProfiles"} = "$1: $2";
        }else{
                $annotation{$query}{"ProSiteProfiles"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(MF.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) {                # Hamap
        if ($annotation{$query}{"Hamap"} eq "-"){
                $annotation{$query}{"Hamap"} = "$1: $2";
        }else{
                $annotation{$query}{"Hamap"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(SSF.*)\" name=\"(.*)\">/ ) {                             # SUPERFAMILY
        if ($annotation{$query}{"SUPERFAMILY"} eq "-"){
                $annotation{$query}{"SUPERFAMILY"} = "$1: $2";
            }else{
                $annotation{$query}{"SUPERFAMILY"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(PR.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) {               # PRINTS
        if ($annotation{$query}{"PRINTS"} eq "-"){
                $annotation{$query}{"PRINTS"} = "$1: $2";
        }else{
                $annotation{$query}{"PRINTS"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(PIRSF.*)\" name=\"(.*)\">/ ) {                           # PIRSF
        if ($annotation{$query}{"PIRSF"} eq "-"){
                $annotation{$query}{"PIRSF"} = "$1: $2";
        }else{
                $annotation{$query}{"PIRSF"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<signature ac=\"(SM.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) {               # SMART
        if ($annotation{$query}{"SMART"} eq "-"){
                $annotation{$query}{"SMART"} = "$1: $2";
        }else{
                $annotation{$query}{"SMART"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\<signature ac=\"(cd.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) {               # CDD
        if ($annotation{$query}{"CDD"} eq "-"){
                $annotation{$query}{"CDD"} = "$1: $2";
        }else{
                $annotation{$query}{"CDD"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\<signature ac=\"(mobidb-lite.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) {      # MobiDBLite
        if ($annotation{$query}{"MobiDBLite"} eq "-"){
                $annotation{$query}{"MobiDBLite"} = "$1: $2";
        }else{
                $annotation{$query}{"MobiDBLite"} .= "; $1: $2";
            }

    } elsif ( $_ =~ /\<go-xref category="BIOLOGICAL_PROCESS" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
        if ($annotation{$query}{"GO_BIO"} eq "-"){
                $annotation{$query}{"GO_BIO"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_BIO"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<go-xref category="MOLECULAR_FUNCTION" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
        if ($annotation{$query}{"GO_MOL"} eq "-"){
                $annotation{$query}{"GO_MOL"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_MOL"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<go-xref category="CELLULAR_COMPONENT" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
        if ($annotation{$query}{"GO_CEL"} eq "-"){
                $annotation{$query}{"GO_CEL"} = "$1: $2";
        }else{
                $annotation{$query}{"GO_CEL"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<pathway-xref db="(.*)" id="(.*)" name="(.*)"\/>/ ) {
        if ($annotation{$query}{"pathway"} eq "-"){
                $annotation{$query}{"pathway"} = "$1-$2: $3";
        } elsif ($annotation{$query}{"pathway"} =~ /$2/) {
            next;
        }else{
                $annotation{$query}{"pathway"} .= "; $1-$2: $3";
            } 



    } elsif ( $_ =~ /\<entry ac="(IP.*)" desc="(.*)" name="(.*)" type="(.*)">/ ) {
        if ($annotation{$query}{"IPRO"} eq "-"){
                $annotation{$query}{"IPRO"} = "$1: $2";
        } elsif ($annotation{$query}{"IPRO"} =~ /$1/) {
            next;
        }else{
                $annotation{$query}{"IPRO"} .= "; $1: $2";
            } 

    } elsif ( $_ =~ /\<\/protein\-matches\>/ ) {

    }

}
close (IN);

open (OUT, ">$output_file") or die "\nCould not open $output_file\n";
print OUT "Query\tInterPro Entry\tInterPro: pathways\tInterPro: GO terms - Molecular Function\tInterPro: GO terms - Biological Process\tInterPro: GO terms - Cellular Component\t";
print OUT "InterPro: Pfam\tInterPro: TIGRFAM\tInterPro: PANTHER\tInterPro: ProSiteProfiles\tInterPro: Hamap\t";
print OUT "InterPro: PIRSF\tInterPro: Gene3D\tInterPro: SUPERFAMILY\tInterPro: PRINTS\tInterPro: SMART\tInterPro: CDD\tInterPro: MobiDBLite\n";

foreach my $query (sort {$annotation{$a}{"order"} <=> $annotation{$b}{"order"}} keys %annotation ) {
    print OUT $query,"\t",$annotation{$query}{"IPRO"},"\t",$annotation{$query}{"pathway"},"\t",$annotation{$query}{"GO_MOL"},"\t",$annotation{$query}{"GO_BIO"},"\t",$annotation{$query}{"GO_CEL"},"\t";
    print OUT $annotation{$query}{"Pfam"},"\t",$annotation{$query}{"TIGRFAM"},"\t",$annotation{$query}{"PANTHER"},"\t";
    print OUT $annotation{$query}{"ProSiteProfiles"},"\t",$annotation{$query}{"Hamap"},"\t",$annotation{$query}{"PIRSF"},"\t";
    print OUT $annotation{$query}{"Gene3D"},"\t",$annotation{$query}{"SUPERFAMILY"},"\t",$annotation{$query}{"PRINTS"},"\t";
    print OUT $annotation{$query}{"SMART"},"\t",$annotation{$query}{"CDD"},"\t",$annotation{$query}{"MobiDBLite"},"\n";
}

close (OUT);


exit 1; 

UPDATE: Here are two example files, the first one works as expected, but the second does not work properly (there is a shift in the fields).

example 1

example 2

Desired output of example 2:

Query   InterPro Entry  InterPro: pathways  InterPro: GO terms - Molecular Function InterPro: GO terms - Biological Process InterPro: GO terms - Cellular Component InterPro: Pfam  InterPro: TIGRFAM   InterPro: PANTHER   InterPro: ProSiteProfiles   InterPro: Hamap InterPro: PIRSF InterPro: Gene3D    InterPro: SUPERFAMILY   InterPro: PRINTS    InterPro: SMART InterPro: CDD   InterPro: MobiDBLite
id_10005" name="id_10005    -   -   -   -   -   PF13614: AAA domain -   PTHR13696:SF85: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED    -   G3DSA:3.40.50.300   SSF52540: P-loop containing nucleoside triphosphate hydrolases  -   -   -   -   -   mobidb-lite: consensus disorder prediction; mobidb-lite: consensus disorder prediction
id_10004" name="id_10004    -   -   -   -   -   PF13614: AAA domain -   PTHR13696:SF86: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED    -   G3DSA:3.40.50.300   SSF52540: P-loop containing nucleoside triphosphate hydrolases  -   -   -   -   cd02042: ParA   -

Upvotes: 1

Views: 244

Answers (1)

Stefan Becker
Stefan Becker

Reputation: 5962

Your code is quite long, so my answer will only present the gist of the idea with a few example items processed. But it should give you a starting point how to process the other items in your XML data.

#!/usr/bin/perl
use strict;
use warnings;

use XML::LibXML;

# XML namespace
use constant DEFAULT_XMLNS => 'http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5';

my $doc;
eval {
    $doc = XML::LibXML->load_xml(IO => \*STDIN);
};
die "XML parser error: $@\n"
    if $@;

# initialize XPath context
# NOTE: all nodes without NS must use default: prefix!
my $xpc = XML::LibXML::XPathContext->new();
$xpc->registerNs('default', DEFAULT_XMLNS);

# Signature processors - code
sub processor_just_primary {
    my($protein, $key, $primary) = @_;
    push(@{ $protein->{$key} }, $primary);
}
sub processor_primary_and_first_attr {
    my($protein, $key, $primary, $attrs) = @_;
    push(@{ $protein->{$key} }, "${primary}: " . $attrs->[0]);
}

# Signature processors - map key, "ac" identifier, attributes, code
my %signature_processors = (
    Gene3D  => {
        id   => qr/^G3DSA:(.+)/,
        attr => [],
        code => \&processor_just_primary,
    },
    Hamap   => {
        id   => qr/^(MF.+)/,
        attr => [ qw{desc name} ],
        code => \&processor_primary_and_first_attr,
    },
    PANTHER => {
        id   => qr/^(PTHR.+)/,
        attr => [ qw{name} ],
        code => \&processor_primary_and_first_attr,
    },
    TIGRFAM => {
        id   => qr/^(TIGR.+)/,
        attr => [ qw{desc name} ],
        code => \&processor_primary_and_first_attr,
    },
);

my @proteins;
foreach my $protein_node ($xpc->findnodes('//default:protein', $doc)) {
    # search <xref> node downwards from <protein> node
    my @xrefs  = $xpc->findnodes('./default:xref', $protein_node)
        or die "Can't find xref node for protein " . $protein_node->toString() . "\n";
    my $id     = $xrefs[0]->getAttribute('id')
        or die "Can't get attribute 'id' for protein " . $xrefs[0]->toString() . "\n";

    # initialize new protein
    # NOTE: a key with an undefined value means "not found" -> empty column
    my %protein  = map { ($_ => undef) } keys %signature_processors;
    $protein{ID} = $id;
    push(@proteins, \%protein);

    # fill protein with signature matches - searching nodes downwards from <protein/matches> node
    foreach my $signature ($xpc->findnodes('./default:matches//default:signature', $protein_node)) {
        my($attr_ac) = $signature->getAttribute('ac')
            or die "Can't get attribute 'ac' for XML node " . $signature->toString() . "\n";

        while (my($key, $processor) = each %signature_processors) {
            my($primary)  = ($attr_ac =~ $processor->{id})
                or next;

            # additional attributes
            my @attrs;
            foreach my $attr (@{ $processor->{attr} }) {
                my($value) = $signature->getAttribute($attr)
                    or die "Can't get attribute '${attr}' for XML node " . $signature->toString() . "\n";
                push(@attrs, $value);
            }

            # call processor
            $processor->{code}->(\%protein, $key, $primary, \@attrs);
        }
    }
}

my @key_order = qw(
    TIGRFAM
    Hamap
    PANTHER
    Gene3D
);

sub dump_row(@) {
    print join("\t", @_), "\n";
}

dump_row('ID', @key_order);
foreach my $protein (@proteins) {
    my @columns =
        map { $_ ? join('; ', @{ $_ }) : '-' } # handle empty columns
        @{ $protein }{@key_order};

    dump_row($protein->{ID}, @columns);
}

exit 0;

Test output for example 1:

$ perl dummy.pl <Downloads/c73whyqk.txt 
ID      TIGRFAM Hamap   PANTHER Gene3D
id_10002        -       -       -       -
id_10001        TIGR00621: ssb: single-stranded DNA-binding protein     MF_00984: Single-stranded DNA-binding protein.  PTHR10302:SF0: SINGLE-STRANDED DNA-BINDING PROTEIN, MITOCHONDRIAL; PTHR10302: SINGLE-STRANDED DNA-BINDING PROTEIN      2.40.50.140
id_10003        -       -       PTHR34107: FAMILY NOT NAMED     3.90.1570.10

Test output for example 2:

$ perl dummy.pl <dummy.xml
ID      TIGRFAM Hamap   PANTHER Gene3D
id_10005        -       -       PTHR13696:SF85: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED        3.40.50.300
id_10004        -       -       PTHR13696:SF86: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED        3.40.50.300

BONUS CODE: as the question is also tagged with I'll add the changes required to generate CSV output:

#!/usr/bin/perl
use strict;
use warnings;

use Text::CSV;
use XML::LibXML;

# ... the main code is left unchanged ...

my $csv = Text::CSV->new()
    or die "Cannot use CSV: " . Text::CSV->error_diag() . "\n";
$csv->eol("\n");

sub dump_row(@) {
    $csv->print(\*STDOUT, \@_);
}

dump_row('ID', @key_order);
foreach my $protein (@proteins) {
    my @columns =
        map { $_ ? join('; ', @{ $_ }) : '' } # handle empty columns
        @{ $protein }{@key_order};

    dump_row($protein->{ID}, @columns);
}

exit 0;

UPDATE 2: it turns out that loading the TSV version of the original code can lead to problem when your CSV importer doesn't have an option to disable , (comma) and ; (semicolon) as separators. The original code should therefore be rewritten to use Text::CSV instead, which does proper quoting and therefore avoids such problems.

I've also added some sanity check code for @key_order vs. %signature_processors.

# sanity checks
die "\@keyorder has keys not in \%signature_processors!\n"
    if grep { not exists $signature_processors{$_} } @key_order;
{
    my %keys = map { ($_ => 1) } @key_order;
    die "\%signature_processors has keys not in \@keyorder!\n"
        if grep { not exists $keys{$_} } keys %signature_processors;
}

my $csv = Text::CSV->new({
    binary   => 1,
    eol      => "\n",
    # Select the output format by uncommenting *one* of the following
    #sep_char => ',', # CSV - comma separated values
    sep_char => "\t", # TSV - TAB separated values
})
    or die "Cannot use CSV: " . Text::CSV->error_diag() . "\n";

sub dump_row(@) {
    $csv->print(\*STDOUT, \@_);
}

Upvotes: 2

Related Questions