Reputation: 1127
My dataset contains the following typos
unique(d$gender)
[1] "k" "kobieta" "M" "K" "m─Ö┼╝czyzna" "21" "m" "M─Ö┼╝czyzna"
> unique(d$age)
[1] 19 NA 21 20 30 32 22 25 29
Actually, rows with 21 for gender and NA for age have been switched and moreover, different naming have been used for gender variable (indeed, all the 'k' heading name corresponds to female 'F' and the heading one with 'm' stand for male 'M'). I've written down this command lines to fix this for gender variable:
> d$gender = ifelse(d$gender == 'K', 'F',
+ ifelse(d$gender =='kobieta', 'F', ifelse(d$gender == 'k', 'F',
+ ifelse(d$gender == "m-Ö++czyzna", 'M',ifelse(d$gender == '21', 'M',
+ ifelse(d$gender == 'm', 'M', ifelse(d$gender == 'M-Ö++czyzna', 'M',
+ ifelse(d$gender == 'M', 'M', 'M'))))))))
>
> unique(d$gender)
[1] "F" "M"
But I don't know how to do he same for age variable, neither if this method could be the right way. Anyone has any suggestions?
This is the dput() result:
dput(head(d,50))
structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60,
60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50,
70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50,
60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60,
60), block = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
t1.key = c("None", "None", "None", "space", "None", "space",
"None", "None", "None", "space", "None", "None", "space",
"None", "None", "space", "None", "None", "space", "None",
"space", "space", "space", "None", "None", "None", "space",
"space", "None", "None", "space", "None", "None", "None",
"None", "None", "None", "space", "space", "None", "None",
"None", "None", "space", "None", "None", "space", "None",
"space", "None"), T1.response = structure(c(1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L), .Label = c("0", "1"), class = "factor"), COND = c("NR",
"NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR",
"NR", "R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R",
"R", "NR", "NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR",
"NR", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR", "NR",
"R", "NR", "NR", "R", "NR", "R", "NR"), T1.rt = c(NA, NA,
NA, 0.812299799988978, NA, 0.72336569998879, NA, NA, NA,
0.772733500052709, NA, NA, 0.606754800013732, NA, NA, 0.601030899968464,
NA, NA, 0.838272600027267, NA, 0.305548300035298, 0.849945599969942,
0.748269900039304, NA, NA, NA, 0.859215400007088, 0.95704890001798,
NA, NA, 0.874362500035204, NA, NA, NA, NA, NA, NA, 0.270455699996091,
0.75726039998699, NA, NA, NA, NA, 0.762694000033662, NA,
NA, 0.789715700026136, NA, 0.90579859999707, NA), CR.key = c("p",
"p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",
"p", "o", "p", "i", "i", "h", "u", "i", "u", "o", "o", "p",
"p", "p", "o", "p", "i", "o", "p", "p", "p", "o", "o", "o",
"p", "i", "p", "p", "o", "o", "i", "i", "o", "o", "i", "i",
"u"), CR.rt = c(0.651771800010465, 0.585048799985088, 0.652350199990906,
0.69888829998672, 1.01917029998731, 0.550036200031173, 0.0361186999944039,
0.568817299965303, 0.452191599993966, 0.514980700041633,
0.619590600021184, 0.719264700019266, 0.466181399999186,
0.45217840000987, 0.668881699966732, 0.914478300022893, 1.01910460001091,
1.40315000002738, 1.69993370003067, 1.71914210001705, 1.29938790004235,
0.698139799991623, 0.848338100011461, 0.651829700043891,
0.486136299965438, 0.703567499993369, 0.76673849998042, 0.54929809999885,
0.718664799991529, 0.768383099988569, 0.898415500007104,
0.819344500021543, 0.61898209998617, 0.737225699995179, 1.03654629999073,
0.971092400024645, 1.4362695000018, 0.999490200018045, 0.932840399967972,
0.586312200000975, 0.786785800009966, 1.01987839996582, 0.93673920002766,
0.715710600023158, 0.819960499997251, 0.75370900001144, 0.818668299994897,
0.903600800025742, 1.1176545000053, 1.10352450003847), trial_num = c(0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51,
52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45, 55,
49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45,
45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49,
45, 55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52,
48, 45, 49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55,
51, 55, 45, 49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49,
51, 52, 51, 49, 49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48,
52, 51, 48), TASK = c("left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left"), T1.correct = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
Go.Nogo..whether.a.person.should.respond. = c("NR", "NR",
"R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR",
"NR", "R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R",
"R", "R", "NR", "NR", "R"), T1.ACC = structure(c(2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), CR = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
2L, 2L, 5L, 1L, 2L, 1L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 2L, 3L,
4L, 4L, 4L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 3L, 2L, 2L, 3L,
3L, 2L, 2L, 1L), .Label = c("1", "2", "3", "4", "9"), class = "factor"),
difficulty = c("medium", "medium", "medium", "easy", "hard",
"hard", "medium", "hard", "easy", "easy", "hard", "hard",
"hard", "hard", "medium", "medium", "easy", "hard", "easy",
"easy", "hard", "medium", "easy", "hard", "easy", "easy",
"medium", "medium", "medium", "medium", "easy", "hard", "hard",
"medium", "hard", "hard", "hard", "easy", "hard", "hard",
"easy", "easy", "hard", "easy", "easy", "medium", "medium",
"medium", "hard", "medium")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
Upvotes: 0
Views: 153
Reputation: 1624
I'd like to introduce you to two things: the %in%
operator and the datastep()
function.
The %in%
operator lets you consolidate possible matches into a vector. So you can do x %in% c(y, z)
instead of ifelse(x == y, a, ifelse(x == z, a, x))
. It can greatly reduce the number of nested conditions.
Second, I wrote a package called libr that has a function called datastep()
that was specifically designed for cleaning data like you are trying to do. It is appropriate for situations when you have a lot of nested conditions and complicated logic. It loops through the data row by row, lets you examine the values on each row, and create new values based on the values of a particular row. The best thing is you can nest the conditionals as deep as you want and still read them.
Here is an example using both %in%
and the datastep()
function:
library(libr)
# Sample data
d <- data.frame(gender = c("k", "kobieta", "M", "K", "m─Ö┼╝czyzna", "21", "m", "M─Ö┼╝czyzna", "F"),
age = c(19, NA, 21, 20, 30, 32, 22, 25, 29))
# Define datastep
d2 <- datastep(d, {
if (gender %in% c('K', 'k', 'kobieta', 'f')) {
gender_corrected <- 'F'
} else if (gender %in% c('m', 'm─Ö┼╝czyzna', 'M─Ö┼╝czyzna')) {
gender_corrected <- 'M'
} else if (gender == 21) {
gender_corrected <- 'M'
} else {
gender_corrected <- gender
}
if (is.na(age)) {
# Do something
} else {
if (age < 10) {
# Do something else
} else {
age_corrected <- age
}
}
})
So the result d2 would look like this:
> d2
gender age age_corrected gender_corrected
1 k 19 19 F
2 kobieta NA NA F
3 M 21 21 M
4 K 20 20 F
5 m-Ö++czyzna 30 30 M
6 21 32 32 M
7 m 22 22 M
8 M-Ö++czyzna 25 25 M
9 F 29 29 F
Upvotes: 1
Reputation: 325
I don't know which is the problem with the Age. But the ifelse statement can be rewritten in the next way:
If there are no anomalies in the d$gender
field:
d$gender2 = ifelse(tolower(substr(d$gender,1,1)) == "k", "F", "M")
If there are anomalies in the d$gender
field:
d$gender2 = ifelse(tolower(substr(d$gender,1,1)) == "k", "F",
ifelse(tolower(substr(d$gender,1,1)) == "m" | d$gender == "21", "M", "Other")
I think that is a more comfortable method. and you could use some variation like this.
In the case of the Age I don't know what you want to do.
Upvotes: 1