Learner
Learner

Reputation: 757

How to remove rows that have fewer than a certain number of non-NA values

I have data like this:

# Example data: a 23-row data frame with a factor column `Data` (row labels)
# and seven numeric columns `Case1`..`Case7`, where NA marks a missing value.
df <- structure(list(Data = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 8L, 12L), .Label = c("A", "B", "C", "D", "E", "F", "G", 
"GH", "H", "I", "J", "JJ", "K", "L", "M", "N", "O", "P", "Q", 
"S", "T", "U", "V"), class = "factor"), Case1 = c(0.775230796, 
0.752114939, 0.738305175, 0.579739531, 0.573781392, 0.572924713, 
0.563521221, 0.558172423, 0.557918102, 0.552505171, 0.551921725, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Case2 = c(NA, 
NA, 0.729029032, NA, NA, NA, NA, NA, 0.736282677, 0.702296369, 
NA, 0.736060259, 0.735161607, 0.735100052, 0.734870114, 0.732743364, 
0.703591649, NA, NA, NA, NA, NA, NA), Case3 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.735568109, 
NA, NA, NA, NA, NA), Case4 = c(0.713963088, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.781378904, 
0.769328289, NA, NA, NA), Case5 = c(NA, NA, NA, NA, 0.693759347, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.688396329, NA, NA, NA, 
NA, NA, 0.6781535, NA, NA), Case6 = c(0.795781477, 0.793446723, 
0.814514206, NA, 0.773564937, NA, NA, 0.783075476, NA, NA, NA, 
0.742827684, 0.796443568, NA, 0.75610514, 0.751162004, NA, NA, 
NA, NA, NA, 0.738104459, NA), Case7 = c(0.732114731, 0.720226731, 
0.782045984, NA, 0.717304483, NA, NA, 0.674992626, NA, NA, NA, 
NA, 0.71098987, NA, 0.676952218, NA, NA, NA, NA, NA, NA, NA, 
0.676754903)), class = "data.frame", row.names = c(NA, -23L))

I want to identify which rows have only one value, and then remove the rows that have only one value or are completely NA.

I can remove those rows that are completely NA using the following

# Drop the rows whose Case columns are all NA.
# df[-1] excludes the first column (Data), which always holds a value and
# would otherwise make every row look non-empty. The comparison yields a
# logical mask, so rows are selected by condition rather than by using the
# per-row NA counts as row indices.
mydf <- df[rowSums(!is.na(df[-1])) > 0, ]

However, I don't know how to remove the rows that have no values or only one value.

Upvotes: 0

Views: 70

Answers (2)

NM_
NM_

Reputation: 1999

To find the rows where all entries are NA (ignoring the first column, Data), use the condition

rowSums(is.na(df[,-1])) == ncol(df[,-1])

To find the rows where there is only one value and the rest are NA, use

rowSums(!is.na(df[,-1])) == 1

Combine them to keep only the rows where neither condition is true (i.e. remove the rows where either condition is true and keep the rest). We can do this with

> df[ !(rowSums(is.na(df[,-1])) == ncol(df[,-1]) | rowSums(!is.na(df[,-1])) == 1), ]

   Data     Case1     Case2 Case3     Case4     Case5     Case6     Case7
1     A 0.7752308        NA    NA 0.7139631        NA 0.7957815 0.7321147
2     B 0.7521149        NA    NA        NA        NA 0.7934467 0.7202267
3     C 0.7383052 0.7290290    NA        NA        NA 0.8145142 0.7820460
5     E 0.5737814        NA    NA        NA 0.6937593 0.7735649 0.7173045
8     H 0.5581724        NA    NA        NA        NA 0.7830755 0.6749926
9     I 0.5579181 0.7362827    NA        NA        NA        NA        NA
10    J 0.5525052 0.7022964    NA        NA        NA        NA        NA
12    L        NA 0.7360603    NA        NA        NA 0.7428277        NA
13    M        NA 0.7351616    NA        NA        NA 0.7964436 0.7109899
15    O        NA 0.7348701    NA        NA 0.6883963 0.7561051 0.6769522
16    P        NA 0.7327434    NA        NA        NA 0.7511620        NA

Upvotes: 1

Andrew
Andrew

Reputation: 5138

Here is one way to do it in base R.

> # Create a column (or you could just use a vector)
> # The [-1] removes the first column, data, from sum
> df$value_count <- rowSums(!is.na(df[-1])) 
> 
> # Subset for more than, e.g. 1, values
> df <- df[df$value_count > 1,]
> df
   Data     Case1     Case2 Case3     Case4     Case5     Case6     Case7 value_count
1     A 0.7752308        NA    NA 0.7139631        NA 0.7957815 0.7321147           4
2     B 0.7521149        NA    NA        NA        NA 0.7934467 0.7202267           3
3     C 0.7383052 0.7290290    NA        NA        NA 0.8145142 0.7820460           4
5     E 0.5737814        NA    NA        NA 0.6937593 0.7735649 0.7173045           4
8     H 0.5581724        NA    NA        NA        NA 0.7830755 0.6749926           3
9     I 0.5579181 0.7362827    NA        NA        NA        NA        NA           2
10    J 0.5525052 0.7022964    NA        NA        NA        NA        NA           2
12    L        NA 0.7360603    NA        NA        NA 0.7428277        NA           2
13    M        NA 0.7351616    NA        NA        NA 0.7964436 0.7109899           3
15    O        NA 0.7348701    NA        NA 0.6883963 0.7561051 0.6769522           4
16    P        NA 0.7327434    NA        NA        NA 0.7511620        NA           2

Or, concisely:

df[rowSums(!is.na(df[-1])) > 1,]

Upvotes: 2

Related Questions