Jaap
Jaap

Reputation: 83215

problems in reading csv-file

I want to load a csv-file in R. However, some values are assigned to the wrong columns. I'm reading the file with:

bm <- read.csv("bm.csv", quote = "", fill=TRUE)
colnames(bm) <- c("id","user","url","title","description","favorite","private","date","removed","classifications")

which results in the following dataframe:

structure(list(id = structure(c(3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 
49L, 50L, 51L, 2L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 
61L), .Label = c("\"\"voeding;\"\"\"", "\"\"ziekte;\"\"\"", "\"33", 
"\"34", "\"35", "\"36", "\"37", "\"38", "\"39", "\"40", "\"41", 
"\"42", "\"43", "\"44", "\"45", "\"46", "\"47", "\"48", "\"49", 
"\"50", "\"51", "\"52", "\"53", "\"54", "\"55", "\"56", "\"57", 
"\"58", "\"59", "\"60", "\"61", "\"62", "\"63", "\"64", "\"65", 
"\"66", "\"67", "\"68", "\"69", "\"70", "\"71", "\"72", "\"73", 
"\"74", "\"75", "\"76", "\"77", "\"78", "\"79", "\"80", "\"81", 
"\"82", "\"83", "\"84", "\"85", "\"86", "\"87", "\"88", "\"89", 
"\"90", "\"91"), class = "factor"), user = structure(c(16L, 16L, 
17L, 17L, 17L, 17L, 19L, 1L, 15L, 16L, 7L, 16L, 10L, 9L, 14L, 
19L, 21L, 9L, 16L, 14L, 19L, 12L, 10L, 7L, 3L, 9L, 5L, 2L, 7L, 
3L, 10L, 14L, 13L, 16L, 17L, 15L, 8L, 18L, 21L, 10L, 4L, 11L, 
17L, 6L, 12L, 7L, 9L, 4L, 16L, 5L, 1L, 2L, 12L, 22L, 20L, 14L, 
24L, 4L, 10L, 23L, 12L), .Label = c("", "\"\"fxf01\"\"", "\"\"fxf02\"\"", 
"\"\"fxf03\"\"", "\"\"fxf04\"\"", "\"\"fxf05\"\"", "\"\"fxf06\"\"", 
"\"\"fxf08\"\"", "\"\"fxf09\"\"", "\"\"fxf10\"\"", "\"\"fxf11\"\"", 
"\"\"fxf15\"\"", "\"\"fxf16\"\"", "\"\"fxf17\"\"", "\"\"fxf18\"\"", 
"\"\"fxf19\"\"", "\"\"fxf20\"\"", "\"\"fxf21\"\"", "\"\"fxf22\"\"", 
"\"\"fxf23\"\"", "\"\"fxf24\"\"", "\"\"fxf25\"\"", "\"\"fxf26\"\"", 
"\"\"fxf27\"\""), class = "factor"), url = structure(c(14L, 14L, 
9L, 5L, 2L, 8L, 12L, 1L, 21L, 8L, 6L, 7L, 7L, 14L, 16L, 15L, 
11L, 21L, 6L, 6L, 11L, 6L, 8L, 17L, 20L, 19L, 11L, 7L, 3L, 18L, 
11L, 8L, 6L, 17L, 17L, 21L, 13L, 7L, 21L, 5L, 13L, 6L, 4L, 9L, 
11L, 16L, 5L, 15L, 16L, 10L, 1L, 6L, 15L, 5L, 21L, 19L, 18L, 
11L, 6L, 6L, 13L), .Label = c("", "\"\"http://gezondheid.blog.nl/overgewicht/2008/06/07/dik-zijn-heeft-veel-nadelen\"\"", 
"\"\"http://home.deds.nl/~obesitasinfo.nl/\"\"", "\"\"http://home.deds.nl/~obesitasinfo.nl/Behandeling.htm\"\"", 
"\"\"http://mens-en-gezondheid.infonu.nl/ziekten/18079-risicos-van-overgewicht-en-de-gevolgen-van-obesitas.html\"\"", 
"\"\"http://nl.wikipedia.org/wiki/Obesitas\"\"", "\"\"http://overgewicht.pilliewillie.nl/obesitas/behandeling.overgewicht.3.php\"\"", 
"\"\"http://www.erfelijkheid.nl/node/325\"\"", "\"\"http://www.gezonderafvallen.nl/page/938/overgewicht-als-gevolg-van-de-evolutie.html\"\"", 
"\"\"http://www.hely.net/gevolgen.html\"\"", "\"\"http://www.hely.net/oorzaken.html\"\"", 
"\"\"http://www.kiloafvallen.nl/\"\"", "\"\"http://www.nisb.nl/kennisplein-sport-bewegen/dossiers/bewegen-en-overgewicht/oorzaken-obesitas.html\"\"", 
"\"\"http://www.novarum.nl/eetproblemen/obesitas/signalen-en-gevolgen\"\"", 
"\"\"http://www.obesitas.azdamiaan.be/nl/index.aspx?n=280\"\"", 
"\"\"http://www.obesitaskliniek.nl/over-obesitas/\"\"", "\"\"http://www.obesitasvereniging.nl/\"\"", 
"\"\"http://www.sagbmaagband.nl/minder-gewicht/morbideobesitas.html\"\"", 
"\"\"http://www.tweestedenziekenhuis.nl/script/Template_SubsubMenu.asp?PageID=1144&SSMID=1247\"\"", 
"\"\"http://www.volkskrant.nl/vk/nl/2672/Wetenschap-Gezondheid/article/detail/3143483/2012/01/30/Balanstop-in-Madurodam-mueslireep-tegen-obesitas.dhtml\"\"", 
"\"\"http://www.zuivelengezondheid.nl/?pageID=332\"\""), class = "factor"), 
    title = structure(c(7L, 7L, 18L, 19L, 3L, 9L, 20L, 1L, 5L, 
    9L, 8L, 13L, 13L, 7L, 17L, 12L, 15L, 5L, 8L, 8L, 15L, 8L, 
    9L, 11L, 2L, 4L, 15L, 16L, 10L, 6L, 15L, 9L, 8L, 11L, 11L, 
    5L, 14L, 13L, 5L, 19L, 14L, 8L, 10L, 18L, 15L, 17L, 19L, 
    12L, 17L, 21L, 1L, 13L, 12L, 19L, 5L, 4L, 6L, 15L, 8L, 8L, 
    14L), .Label = c("", "\"\"Balanstop in Madurodam: mueslireep tegen obesitas - Wetenschap & Gezondheid - VK\"\"", 
    "\"\"Dik zijn heeft veel nadelen | Gezondheid\"\"", "\"\"Gevolgen obesitas - TweeSteden ziekenhuis\"\"", 
    "\"\"Gevolgen overgewicht | Zuivel en gezondheid\"\"", "\"\"Morbide obesitas / ernstig overgewicht - Website over de SAGB Maagband\"\"", 
    "\"\"Novarum - Wat zijn de signalen en gevolgen van obesitas?\"\"", 
    "\"\"Obesitas - Wikipedia\"\"", "\"\"Obesitas | erfelijkheid.nl\"\"", 
    "\"\"Obesitas info\"\"", "\"\"Obesitas Vereniging\"\"", "\"\"Obesitas- en overgewichtskliniek - Home - Algemene info - Oorzaken\"\"", 
    "\"\"OBESITAS\"\"", "\"\"Oorzaken obesitas\"\"", "\"\"oorzaken van obesitas\"\"", 
    "\"\"OU Bookmarktool\"\"", "\"\"Over obesitas | Nederlandse Obesitas Kliniek\"\"", 
    "\"\"Overgewicht als gevolg van de evolutie\"\"", "\"\"Risico\xe4\xf3\xbbs van overgewicht en de Gevolgen van Obesitas | Mens en Gezondheid: Ziekten\"\"", 
    "\"\"Snel Afvallen - Phase 2", "\"\"wat zijn de gevolgen van obesitas"
    ), class = "factor"), description = structure(c(4L, 4L, 4L, 
    4L, 4L, 4L, 3L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 2L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("", 
    " morbide obesitas en overgewicht\"\"", " Tips & Trucs en nog veel meer\"\"", 
    "\"\"\"\""), class = "factor"), favorite = structure(c(2L, 
    2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 
    3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 
    2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 2L
    ), .Label = c("", "\"\"\"\"", "\"\"1\"\""), class = "factor"), 
    private = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "\"\"\"\"", "\"\"1\"\""
    ), class = "factor"), date = structure(c(3L, 3L, 3L, 4L, 
    4L, 5L, 2L, 1L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 
    7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
    8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 
    2L, 1L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("", 
    "\"\"\"\"", "\"\"16-04-2012 9:09:45\"\"", "\"\"16-04-2012 9:09:46\"\"", 
    "\"\"16-04-2012 9:09:49\"\"", "\"\"16-04-2012 9:09:50\"\"", 
    "\"\"16-04-2012 9:09:51\"\"", "\"\"16-04-2012 9:09:52\"\"", 
    "\"\"16-04-2012 9:09:53\"\""), class = "factor"), removed = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 4L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("", "\"\"\"\"", "\"\"16-04-2012 9:09:49\"\"", 
    "\"\"16-04-2012 9:09:53\"\""), class = "factor"), classifications = structure(c(8L, 
    8L, 7L, 8L, 6L, 5L, 3L, 1L, 2L, 5L, 8L, 7L, 7L, 8L, 8L, 5L, 
    8L, 6L, 6L, 8L, 4L, 8L, 5L, 8L, 8L, 8L, 8L, 2L, 8L, 8L, 7L, 
    5L, 2L, 5L, 8L, 8L, 4L, 7L, 8L, 8L, 8L, 8L, 4L, 2L, 7L, 8L, 
    8L, 7L, 8L, 3L, 1L, 2L, 7L, 6L, 6L, 6L, 8L, 7L, 6L, 7L, 7L
    ), .Label = c("", "\"\";\"\"\"", "\"\"\"\"", "\"\"beweging;\"\"\"", 
    "\"\"erfelijkheid;\"\"\"", "\"\"sociale aspecten;\"\"\"", 
    "\"\"voeding;\"\"\"", "\"\"ziekte;\"\"\""), class = "factor")), .Names = c("id", 
"user", "url", "title", "description", "favorite", "private", 
"date", "removed", "classifications"), class = "data.frame", row.names = c(NA, 
-61L))

I can see why its going wrong: the description column is mostly empty, but when not, every value is put in the next column and the value of the last column is put in the first column on the next row. Any ideas how to solve this?

Upvotes: 1

Views: 134

Answers (1)

llrs
llrs

Reputation: 3397

The file is very bad formatted, is separated by commas but quoted, and it also have the row.names. If you created this file print it again with the next command it will be easier to read.

write.csv("bm.csv", header=TRUE, row.names=FALSE)

And then read it with:

bm <- read.csv("bm.csv", sep = "\t", header=TRUE)

If you didn't create the file, this steps solve your problem (I think):

 bm <- read.csv("bm.csv", fill=TRUE, sep="\"")

If you look at the data now (head(bm)) it is full of empty columns. To select just the ones you want you can make a vector:

bm <- bm[c("user","url","title","favorite", "date", "classifications")]

And then change the names to whatever you want, with :

names(bm) <- c("user", "url", "title descriptions","favorite", "private date", "classifications")

Upvotes: 1

Related Questions