Reputation: 845
I have a xml results file (gzipped) that I want to output as a nice tab-separated table.
I came up with a script that should do that job (code see below).
BUT: When some of the sections in the file, e.g. Hamap
, are not present, then the table is shifted and no hyphen is printed when the field should be empty. How do I have to modify the script so that fields that are absent are also outputted as empty?
I can post two examples of my xml file in case you wish!
use warnings;
use strict;
use IO::Compress::Gzip;
my $input_file = $ARGV[0];
my $output_file = "$ARGV[0]".".tsv";
$output_file =~ s/\.gz//;
my $i = 0;
my $query; my %annotation;
open (IN,"gzip -dc $input_file |") or die "\nCould not open $input_file\n";
while (<IN>) {
if ( $_ =~ /\<xref desc=\"(.*)\" db=\"(.*)\" id=\"(.*)\" name=\"(.*)\"\/\>/) {
$query = $3;
$annotation{$query}{"order"} = $i;
$annotation{$query}{"Pfam"} = "-";
$annotation{$query}{"TIGRFAM"} = "-";
$annotation{$query}{"Gene3D"} = "-";
$annotation{$query}{"PANTHER"} = "-";
$annotation{$query}{"ProSiteProfiles"} = "-";
$annotation{$query}{"Hamap"} = "-";
$annotation{$query}{"SUPERFAMILY"} = "-";
$annotation{$query}{"PRINTS"} = "-";
$annotation{$query}{"PIRSF"} = "-";
$annotation{$query}{"SMART"} = "-";
$annotation{$query}{"GO_BIO"} = "-";
$annotation{$query}{"GO_MOL"} = "-";
$annotation{$query}{"GO_CEL"} = "-";
$annotation{$query}{"IPRO"} = "-";
$annotation{$query}{"pathway"} = "-";
$annotation{$query}{"CDD"} = "-";
$annotation{$query}{"MobiDBLite"} = "-";
$i++;
} elsif ($_ =~ /\<xref/){
if ($_ =~ /id=\"(.*)\"/){
$query=$1;
$annotation{$query}{"order"} = $i;
$annotation{$query}{"Pfam"} = "-";
$annotation{$query}{"TIGRFAM"} = "-";
$annotation{$query}{"Gene3D"} = "-";
$annotation{$query}{"PANTHER"} = "-";
$annotation{$query}{"ProSiteProfiles"} = "-";
$annotation{$query}{"Hamap"} = "-";
$annotation{$query}{"SUPERFAMILY"} = "-";
$annotation{$query}{"PRINTS"} = "-";
$annotation{$query}{"PIRSF"} = "-";
$annotation{$query}{"SMART"} = "-";
$annotation{$query}{"GO_BIO"} = "-";
$annotation{$query}{"GO_MOL"} = "-";
$annotation{$query}{"GO_CEL"} = "-";
$annotation{$query}{"IPRO"} = "-";
$annotation{$query}{"pathway"} = "-";
$annotation{$query}{"CDD"} = "-";
$annotation{$query}{"MobiDBLite"} = "-";
$i++;
}
} elsif ( $_ =~ /\<signature ac=\"(PF.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) { # Pfam
if ($annotation{$query}{"Pfam"} eq "-"){
$annotation{$query}{"Pfam"} = "$1: $2";
}else{
$annotation{$query}{"Pfam"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(TIGR.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) { # TIGRFAM
if ($annotation{$query}{"TIGRFAM"} eq "-"){
$annotation{$query}{"TIGRFAM"} = "$1: $2";
}else{
$annotation{$query}{"TIGRFAM"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(G3DSA:.*)\">/ ) { # Gene3D
if ($annotation{$query}{"Gene3D"} eq "-"){
$annotation{$query}{"Gene3D"} = "$1";
}else{
$annotation{$query}{"Gene3D"} .= "; $1";
}
} elsif ( $_ =~ /\<signature ac=\"(PTHR.*)\" name=\"(.*)\"\>/ ) { # PANTHER
if ($annotation{$query}{"PANTHER"} eq "-"){
$annotation{$query}{"PANTHER"} = "$1: $2";
}else{
$annotation{$query}{"PANTHER"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(PS.*)\" desc=\"(.*)\">/ ) { # ProSiteProfiles
if ($annotation{$query}{"ProSiteProfiles"} eq "-"){
$annotation{$query}{"ProSiteProfiles"} = "$1: $2";
}else{
$annotation{$query}{"ProSiteProfiles"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(MF.*)\" desc=\"(.*)\" name=\"(.*)\">/ ) { # Hamap
if ($annotation{$query}{"Hamap"} eq "-"){
$annotation{$query}{"Hamap"} = "$1: $2";
}else{
$annotation{$query}{"Hamap"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(SSF.*)\" name=\"(.*)\">/ ) { # SUPERFAMILY
if ($annotation{$query}{"SUPERFAMILY"} eq "-"){
$annotation{$query}{"SUPERFAMILY"} = "$1: $2";
}else{
$annotation{$query}{"SUPERFAMILY"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(PR.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) { # PRINTS
if ($annotation{$query}{"PRINTS"} eq "-"){
$annotation{$query}{"PRINTS"} = "$1: $2";
}else{
$annotation{$query}{"PRINTS"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(PIRSF.*)\" name=\"(.*)\">/ ) { # PIRSF
if ($annotation{$query}{"PIRSF"} eq "-"){
$annotation{$query}{"PIRSF"} = "$1: $2";
}else{
$annotation{$query}{"PIRSF"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(SM.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) { # SMART
if ($annotation{$query}{"SMART"} eq "-"){
$annotation{$query}{"SMART"} = "$1: $2";
}else{
$annotation{$query}{"SMART"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(cd.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) { # CDD
if ($annotation{$query}{"CDD"} eq "-"){
$annotation{$query}{"CDD"} = "$1: $2";
}else{
$annotation{$query}{"CDD"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<signature ac=\"(mobidb-lite.*)\" desc=\"(.*)\" name=\"(.*)\"\>/ ) { # MobiDBLite
if ($annotation{$query}{"MobiDBLite"} eq "-"){
$annotation{$query}{"MobiDBLite"} = "$1: $2";
}else{
$annotation{$query}{"MobiDBLite"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<go-xref category="BIOLOGICAL_PROCESS" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
if ($annotation{$query}{"GO_BIO"} eq "-"){
$annotation{$query}{"GO_BIO"} = "$1: $2";
}else{
$annotation{$query}{"GO_BIO"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<go-xref category="MOLECULAR_FUNCTION" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
if ($annotation{$query}{"GO_MOL"} eq "-"){
$annotation{$query}{"GO_MOL"} = "$1: $2";
}else{
$annotation{$query}{"GO_MOL"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<go-xref category="CELLULAR_COMPONENT" db="GO" id="(GO:.*)" name="(.*)"\/>/ ) {
if ($annotation{$query}{"GO_CEL"} eq "-"){
$annotation{$query}{"GO_CEL"} = "$1: $2";
}else{
$annotation{$query}{"GO_CEL"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<pathway-xref db="(.*)" id="(.*)" name="(.*)"\/>/ ) {
if ($annotation{$query}{"pathway"} eq "-"){
$annotation{$query}{"pathway"} = "$1-$2: $3";
} elsif ($annotation{$query}{"pathway"} =~ /$2/) {
next;
}else{
$annotation{$query}{"pathway"} .= "; $1-$2: $3";
}
} elsif ( $_ =~ /\<entry ac="(IP.*)" desc="(.*)" name="(.*)" type="(.*)">/ ) {
if ($annotation{$query}{"IPRO"} eq "-"){
$annotation{$query}{"IPRO"} = "$1: $2";
} elsif ($annotation{$query}{"IPRO"} =~ /$1/) {
next;
}else{
$annotation{$query}{"IPRO"} .= "; $1: $2";
}
} elsif ( $_ =~ /\<\/protein\-matches\>/ ) {
}
}
close (IN);
open (OUT, ">$output_file") or die "\nCould not open $output_file\n";
print OUT "Query\tInterPro Entry\tInterPro: pathways\tInterPro: GO terms - Molecular Function\tInterPro: GO terms - Biological Process\tInterPro: GO terms - Cellular Component\t";
print OUT "InterPro: Pfam\tInterPro: TIGRFAM\tInterPro: PANTHER\tInterPro: ProSiteProfiles\tInterPro: Hamap\t";
print OUT "InterPro: PIRSF\tInterPro: Gene3D\tInterPro: SUPERFAMILY\tInterPro: PRINTS\tInterPro: SMART\tInterPro: CDD\tInterPro: MobiDBLite\n";
foreach my $query (sort {$annotation{$a}{"order"} <=> $annotation{$b}{"order"}} keys %annotation ) {
print OUT $query,"\t",$annotation{$query}{"IPRO"},"\t",$annotation{$query}{"pathway"},"\t",$annotation{$query}{"GO_MOL"},"\t",$annotation{$query}{"GO_BIO"},"\t",$annotation{$query}{"GO_CEL"},"\t";
print OUT $annotation{$query}{"Pfam"},"\t",$annotation{$query}{"TIGRFAM"},"\t",$annotation{$query}{"PANTHER"},"\t";
print OUT $annotation{$query}{"ProSiteProfiles"},"\t",$annotation{$query}{"Hamap"},"\t",$annotation{$query}{"PIRSF"},"\t";
print OUT $annotation{$query}{"Gene3D"},"\t",$annotation{$query}{"SUPERFAMILY"},"\t",$annotation{$query}{"PRINTS"},"\t";
print OUT $annotation{$query}{"SMART"},"\t",$annotation{$query}{"CDD"},"\t",$annotation{$query}{"MobiDBLite"},"\n";
}
close (OUT);
exit 1;
UPDATE: Here are two example files, the first one works as expected, but the second does not work properly (there is a shift in the fields).
Desired output of example 2:
Query InterPro Entry InterPro: pathways InterPro: GO terms - Molecular Function InterPro: GO terms - Biological Process InterPro: GO terms - Cellular Component InterPro: Pfam InterPro: TIGRFAM InterPro: PANTHER InterPro: ProSiteProfiles InterPro: Hamap InterPro: PIRSF InterPro: Gene3D InterPro: SUPERFAMILY InterPro: PRINTS InterPro: SMART InterPro: CDD InterPro: MobiDBLite
id_10005" name="id_10005 - - - - - PF13614: AAA domain - PTHR13696:SF85: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED - G3DSA:3.40.50.300 SSF52540: P-loop containing nucleoside triphosphate hydrolases - - - - - mobidb-lite: consensus disorder prediction; mobidb-lite: consensus disorder prediction
id_10004" name="id_10004 - - - - - PF13614: AAA domain - PTHR13696:SF86: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED - G3DSA:3.40.50.300 SSF52540: P-loop containing nucleoside triphosphate hydrolases - - - - cd02042: ParA -
Upvotes: 1
Views: 244
Reputation: 5962
Your code is quite long, so my answer will only present the gist of the idea with a few example items processed. But it should give you a starting point how to process the other items in your XML data.
#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXML;
# XML namespace
use constant DEFAULT_XMLNS => 'http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5';
my $doc;
eval {
$doc = XML::LibXML->load_xml(IO => \*STDIN);
};
die "XML parser error: $@\n"
if $@;
# initialize XPath context
# NOTE: all nodes without NS must use default: prefix!
my $xpc = XML::LibXML::XPathContext->new();
$xpc->registerNs('default', DEFAULT_XMLNS);
# Signature processors - code
sub processor_just_primary {
my($protein, $key, $primary) = @_;
push(@{ $protein->{$key} }, $primary);
}
sub processor_primary_and_first_attr {
my($protein, $key, $primary, $attrs) = @_;
push(@{ $protein->{$key} }, "${primary}: " . $attrs->[0]);
}
# Signature processors - map key, "ac" identifier, attributes, code
my %signature_processors = (
Gene3D => {
id => qr/^G3DSA:(.+)/,
attr => [],
code => \&processor_just_primary,
},
Hamap => {
id => qr/^(MF.+)/,
attr => [ qw{desc name} ],
code => \&processor_primary_and_first_attr,
},
PANTHER => {
id => qr/^(PTHR.+)/,
attr => [ qw{name} ],
code => \&processor_primary_and_first_attr,
},
TIGRFAM => {
id => qr/^(TIGR.+)/,
attr => [ qw{desc name} ],
code => \&processor_primary_and_first_attr,
},
);
my @proteins;
foreach my $protein_node ($xpc->findnodes('//default:protein', $doc)) {
# search <xref> node downwards from <protein> node
my @xrefs = $xpc->findnodes('./default:xref', $protein_node)
or die "Can't find xref node for protein " . $protein_node->toString() . "\n";
my $id = $xrefs[0]->getAttribute('id')
or die "Can't get attribute 'id' for protein " . $xrefs[0]->toString() . "\n";
# initialize new protein
# NOTE: a key with an undefined value means "not found" -> empty column
my %protein = map { ($_ => undef) } keys %signature_processors;
$protein{ID} = $id;
push(@proteins, \%protein);
# fill protein with signature matches - searching nodes downwards from <protein/matches> node
foreach my $signature ($xpc->findnodes('./default:matches//default:signature', $protein_node)) {
my($attr_ac) = $signature->getAttribute('ac')
or die "Can't get attribute 'ac' for XML node " . $signature->toString() . "\n";
while (my($key, $processor) = each %signature_processors) {
my($primary) = ($attr_ac =~ $processor->{id})
or next;
# additional attributes
my @attrs;
foreach my $attr (@{ $processor->{attr} }) {
my($value) = $signature->getAttribute($attr)
or die "Can't get attribute '${attr}' for XML node " . $signature->toString() . "\n";
push(@attrs, $value);
}
# call processor
$processor->{code}->(\%protein, $key, $primary, \@attrs);
}
}
}
my @key_order = qw(
TIGRFAM
Hamap
PANTHER
Gene3D
);
sub dump_row(@) {
print join("\t", @_), "\n";
}
dump_row('ID', @key_order);
foreach my $protein (@proteins) {
my @columns =
map { $_ ? join('; ', @{ $_ }) : '-' } # handle empty columns
@{ $protein }{@key_order};
dump_row($protein->{ID}, @columns);
}
exit 0;
Test output for example 1:
$ perl dummy.pl <Downloads/c73whyqk.txt
ID TIGRFAM Hamap PANTHER Gene3D
id_10002 - - - -
id_10001 TIGR00621: ssb: single-stranded DNA-binding protein MF_00984: Single-stranded DNA-binding protein. PTHR10302:SF0: SINGLE-STRANDED DNA-BINDING PROTEIN, MITOCHONDRIAL; PTHR10302: SINGLE-STRANDED DNA-BINDING PROTEIN 2.40.50.140
id_10003 - - PTHR34107: FAMILY NOT NAMED 3.90.1570.10
Test output for example 2:
$ perl dummy.pl <dummy.xml
ID TIGRFAM Hamap PANTHER Gene3D
id_10005 - - PTHR13696:SF85: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED 3.40.50.300
id_10004 - - PTHR13696:SF86: SUBFAMILY NOT NAMED; PTHR13696: FAMILY NOT NAMED 3.40.50.300
BONUS CODE: as the question is also tagged with csv I'll add the changes required to generate CSV output:
#!/usr/bin/perl
use strict;
use warnings;
use Text::CSV;
use XML::LibXML;
# ... the main code is left unchanged ...
my $csv = Text::CSV->new()
or die "Cannot use CSV: " . Text::CSV->error_diag() . "\n";
$csv->eol("\n");
sub dump_row(@) {
$csv->print(\*STDOUT, \@_);
}
dump_row('ID', @key_order);
foreach my $protein (@proteins) {
my @columns =
map { $_ ? join('; ', @{ $_ }) : '' } # handle empty columns
@{ $protein }{@key_order};
dump_row($protein->{ID}, @columns);
}
exit 0;
UPDATE 2: it turns out that loading the TSV version of the original code can lead to problem when your CSV importer doesn't have an option to disable ,
(comma) and ;
(semicolon) as separators. The original code should therefore be rewritten to use Text::CSV instead, which does proper quoting and therefore avoids such problems.
I've also added some sanity check code for @key_order
vs. %signature_processors
.
# sanity checks
die "\@keyorder has keys not in \%signature_processors!\n"
if grep { not exists $signature_processors{$_} } @key_order;
{
my %keys = map { ($_ => 1) } @key_order;
die "\%signature_processors has keys not in \@keyorder!\n"
if grep { not exists $keys{$_} } keys %signature_processors;
}
my $csv = Text::CSV->new({
binary => 1,
eol => "\n",
# Select the output format by uncommenting *one* of the following
#sep_char => ',', # CSV - comma separated values
sep_char => "\t", # TSV - TAB separated values
})
or die "Cannot use CSV: " . Text::CSV->error_diag() . "\n";
sub dump_row(@) {
$csv->print(\*STDOUT, \@_);
}
Upvotes: 2