Reputation: 1645
I have a file with 20 fields as headers in the first row. The remaining rows have unequal numbers of fields; some of the rows have more columns than the header. When I tried to read it using read.delim(), it read the data without error, but the total row count is greater than the number of rows in the original file.
Here are a few lines of the file:
Chromosome Position SNPid Reference Alternate QUAL Homozygosity Tool Depth MappingQuality EFFECT IMPACT FUNCTIONAL_CLASS CODON_CHANGE AMINO_ACID_CHANGE GENE_NAME GENE_BIOTYPE GENE_CODING TRANSCRIPT_ID EXON_ID
chr1 403111 . G A 24 het SAM 20 55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ _
chr1 602567 rs21953190 A G 3265.77 hom GATKSAM 91 58.46 SYNONYMOUS_CODING LOW SILENT gaT/gaC D1034 ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 604894 rs21953191 A G 2869.77 hom GATKSAM 77 59.70 NON_SYNONYMOUS_CODING MODERATE MISSENSE Ttt/Ctt F259L ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 758630 . T TC 1531.73 hom GATKSAM 38 46.20 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 2 _
chr1 800715 . C CT 514.73 hom GATKSAM 13 60.00 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 ,SPLICE_SITE_ACCEPTOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 7 ,SPLICE_SITE_DONOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 _
chr1 1104035 rs21966859 G A 3803.77 hom GATKSAM 97 57.97 INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 2 ,INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 2 _
chr1 1120994 . CGCG C 604.73 hom GATKSAM 21 56.55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 _ _
chr1 1136916 rs21935602 G A 3899.77 hom GATKSAM 101 59.17 DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000000014 _ ,DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000042968 _ ,UTR_3_PRIME MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000046825 29 _
There are 9 rows in the file. But when it is read into R and the rows are counted, the count is shown as 12.
read.delim("test.txt",header=T,sep='\t')->data
nrow(data)
Could someone help me read the data properly?
Below is the output from dput(data)
> dput(data)
structure(list(Chromosome = structure(c(3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 2L, 3L, 2L), .Label = c("HIGH", "MODIFIER", "chr1"
), class = "factor"), Position = structure(c(4L, 5L, 6L, 7L,
8L, 9L, 1L, 9L, 2L, 9L, 3L, 9L), .Label = c("1104035", "1120994",
"1136916", "403111", "602567", "604894", "758630", "800715",
"_"), class = "factor"), SNPid = structure(c(1L, 4L, 5L, 1L,
1L, 2L, 6L, 2L, 1L, 2L, 3L, 2L), .Label = c(".", "_", "rs21935602",
"rs21953190", "rs21953191", "rs21966859"), class = "factor"),
Reference = structure(c(4L, 1L, 1L, 5L, 2L, 6L, 4L, 6L, 3L,
6L, 4L, 6L), .Label = c("A", "C", "CGCG", "G", "T", "_"), class = "factor"),
Alternate = structure(c(1L, 5L, 5L, 8L, 4L, 7L, 1L, 6L, 3L,
6L, 1L, 2L), .Label = c("A", "ATP9B", "C", "CT", "G", "NFATC1",
"PQLC1", "TC"), class = "factor"), QUAL = structure(c(2L,
4L, 3L, 1L, 7L, 9L, 5L, 9L, 8L, 9L, 6L, 9L), .Label = c("1531.73",
"24", "2869.77", "3265.77", "3803.77", "3899.77", "514.73",
"604.73", "protein_coding"), class = "factor"), Homozygosity = structure(c(2L,
3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("CODING",
"het", "hom"), class = "factor"), Tool = structure(c(6L,
5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 2L, 5L, 4L), .Label = c("ENSCAFT00000000011",
"ENSCAFT00000000013", "ENSCAFT00000036234", "ENSCAFT00000042968",
"GATKSAM", "SAM"), class = "factor"), Depth = structure(c(4L,
9L, 8L, 6L, 2L, 7L, 10L, 3L, 5L, 11L, 1L, 11L), .Label = c("101",
"13", "2", "20", "21", "38", "7", "77", "91", "97", "_"), class = "factor"),
MappingQuality = structure(c(5L, 8L, 10L, 4L, 11L, 1L, 7L,
12L, 6L, 2L, 9L, 3L), .Label = c(",SPLICE_SITE_DONOR", ",UPSTREAM",
",UTR_3_PRIME", "46.20", "55", "56.55", "57.97", "58.46",
"59.17", "59.70", "60.00", "_"), class = "factor"), EFFECT = structure(c(4L,
8L, 7L, 5L, 5L, 3L, 5L, 1L, 4L, 6L, 2L, 6L), .Label = c("",
"DOWNSTREAM", "HIGH", "INTERGENIC", "INTRON", "MODIFIER",
"NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"), class = "factor"),
IMPACT = structure(c(4L, 2L, 3L, 4L, 4L, 5L, 4L, 1L, 4L,
5L, 4L, 5L), .Label = c("", "LOW", "MODERATE", "MODIFIER",
"_"), class = "factor"), FUNCTIONAL_CLASS = structure(c(4L,
3L, 2L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L), .Label = c("",
"MISSENSE", "SILENT", "_"), class = "factor"), CODON_CHANGE = structure(c(3L,
4L, 2L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L), .Label = c("",
"Ttt/Ctt", "_", "gaT/gaC"), class = "factor"), AMINO_ACID_CHANGE = structure(c(7L,
3L, 4L, 7L, 7L, 6L, 7L, 1L, 7L, 5L, 7L, 2L), .Label = c("",
"ATP9B", "D1034", "F259L", "NFATC1", "PQLC1", "_"), class = "factor"),
GENE_NAME = structure(c(6L, 2L, 2L, 5L, 5L, 7L, 4L, 1L, 6L,
7L, 3L, 7L), .Label = c("", "ADNP2", "ATP9B", "NFATC1", "PQLC1",
"_", "protein_coding"), class = "factor"), GENE_BIOTYPE = structure(c(3L,
4L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 4L, 2L), .Label = c("",
"CODING", "_", "protein_coding"), class = "factor"), GENE_CODING = structure(c(6L,
2L, 2L, 2L, 2L, 3L, 2L, 1L, 6L, 4L, 2L, 5L), .Label = c("",
"CODING", "ENSCAFT00000000011", "ENSCAFT00000036234", "ENSCAFT00000046825",
"_"), class = "factor"), TRANSCRIPT_ID = structure(c(8L,
4L, 4L, 5L, 5L, 3L, 6L, 1L, 8L, 8L, 7L, 2L), .Label = c("",
"29", "6", "ENSCAFT00000000008", "ENSCAFT00000000011", "ENSCAFT00000000013",
"ENSCAFT00000000014", "_"), class = "factor"), EXON_ID = structure(c(5L,
3L, 3L, 2L, 4L, 5L, 2L, 1L, 5L, 5L, 5L, 5L), .Label = c("",
"2", "5", "6", "_"), class = "factor"), X = structure(c(6L,
6L, 6L, 6L, 4L, 1L, 3L, 1L, 5L, 1L, 2L, 1L), .Label = c("",
",DOWNSTREAM", ",INTRON", ",SPLICE_SITE_ACCEPTOR", ",UPSTREAM",
"_"), class = "factor")), .Names = c("Chromosome", "Position",
"SNPid", "Reference", "Alternate", "QUAL", "Homozygosity", "Tool",
"Depth", "MappingQuality", "EFFECT", "IMPACT", "FUNCTIONAL_CLASS",
"CODON_CHANGE", "AMINO_ACID_CHANGE", "GENE_NAME", "GENE_BIOTYPE",
"GENE_CODING", "TRANSCRIPT_ID", "EXON_ID", "X"), class = "data.frame", row.names = c(NA,
-12L))
Upvotes: 1
Views: 945
Reputation: 263382
Looking at the data you can see that it is highly "mutated" with many fusion lines. These are in many cases signaled by the presence of commas. I think this data is in a different format than you expect. Your first element in the dput data was a factor with Chromosome values =c("HIGH", "MODIFIER", "chr1"). That's not a sensible result, pointing to a lack of understanding on your part about the organization of the original data. You should post the original text file somewhere that can be accessed over the Internet, so the original layout can be examined. In particular the tabs you think are the delimiters are either not there or are not being captured by the SO interface.
After being pointed to the data sample (which you should have edited into the question body), try this to delete the comments that follow the commas:
datL <- readLines("~/Downloads/test.txt")
datLred <- gsub("[,].+$", "", datL)
read.delim(text=datLred)
> str(read.delim(text=datLred) )
'data.frame': 8 obs. of 21 variables:
$ Chromosome : Factor w/ 1 level "chr1": 1 1 1 1 1 1 1 1
$ Position : int 403111 602567 604894 758630 800715 1104035 1120994 1136916
$ SNPid : Factor w/ 5 levels ".","rs21935602",..: 1 3 4 1 1 5 1 2
$ Reference : Factor w/ 5 levels "A","C","CGCG",..: 4 1 1 5 2 4 3 4
$ Alternate : Factor w/ 5 levels "A","C","CT","G",..: 1 4 4 5 3 1 2 1
(remaining columns snipped)
Upvotes: 2
Reputation: 226372
R thinks you have 21 rather than 20 fields per line (maybe there are trailing tabs on each line?), and your lines 6-9 have additional fields:
count.fields("test.txt",sep="\t")
## [1] 21 21 21 21 21 41 31 41 41
This confuses the heck out of read.delim, which tries to guess what's going on from the first 5 lines (it shouldn't, but that's the way it is). You might think you could use fill=TRUE to get around this, but you can't.
I tried using colClasses along with fill=TRUE to specify the field types (I used colClasses=rep("character",41) but you can probably guess better than that), but it doesn't seem to work, probably because your header only has 21 columns.
The fread function in the data.table package can do a little better, but only if you tell it not to try to guess the format from lines after #5, and it discards the data in columns beyond 21.
library(data.table)
nrow(fread("test.txt",autostart=5)) ## 9
Hmm, even that doesn't quite work as expected (it doesn't pick up the header properly, even if I set header=TRUE, probably because column 21 doesn't have a header field) ... The bottom line is that you probably have to figure out what those extra fields are and do something more explicit with them (e.g. add header fields ...)
Basically, R expects your data to be pretty clean. It might be worth sending this example to the maintainer of the data.table package, who is trying to make fread be as robust as possible ... this would represent a challenge.
Upvotes: 2