Reputation: 663
I have a data frame that looks like this:
# Sample data (dput output): 11 rows with row names 60 to 70 and 15 columns
# V1..V15. The integer columns are plain vectors; the others are factors.
# Every factor label carries a leading space, so the "unknown" marker is the
# level " ?" (not "?"). In this sample it occurs only in rows 62 and 70
# (columns V2 and V7, plus V14 for row 62).
df <- structure(list(V1 = c(30L, 30L, 32L, 48L, 42L, 29L, 36L, 28L,
53L, 49L, 25L), V2 = structure(c(5L, 5L, 1L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 1L), .Label = c(" ?", " Federal-gov", " Local-gov",
" Never-worked", " Private", " Self-emp-inc", " Self-emp-not-inc",
" State-gov", " Without-pay"), class = "factor"), V3 = c(188146L,
59496L, 293936L, 149640L, 116632L, 105598L, 155537L, 183175L,
169846L, 191681L, 200681L), V4 = structure(c(12L, 10L, 6L, 12L,
11L, 16L, 12L, 16L, 12L, 16L, 16L), .Label = c(" 10th", " 11th",
" 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " Assoc-acdm",
" Assoc-voc", " Bachelors", " Doctorate", " HS-grad", " Masters",
" Preschool", " Prof-school", " Some-college"), class = "factor"),
V5 = c(9L, 13L, 4L, 9L, 16L, 10L, 9L, 10L, 9L, 10L, 10L),
V6 = structure(c(3L, 3L, 4L, 3L, 3L, 1L, 3L, 1L, 3L, 3L,
5L), .Label = c(" Divorced", " Married-AF-spouse", " Married-civ-spouse",
" Married-spouse-absent", " Never-married", " Separated",
" Widowed"), class = "factor"), V7 = structure(c(8L, 13L,
1L, 15L, 11L, 14L, 4L, 2L, 2L, 5L, 1L), .Label = c(" ?",
" Adm-clerical", " Armed-Forces", " Craft-repair", " Exec-managerial",
" Farming-fishing", " Handlers-cleaners", " Machine-op-inspct",
" Other-service", " Priv-house-serv", " Prof-specialty",
" Protective-serv", " Sales", " Tech-support", " Transport-moving"
), class = "factor"), V8 = structure(c(1L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 6L, 1L, 4L), .Label = c(" Husband", " Not-in-family",
" Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"),
V9 = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), V10 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c(" Female",
" Male"), class = "factor"), V11 = c(5013L, 2407L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), V12 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), V13 = c(40L, 40L, 40L, 40L, 45L,
58L, 40L, 40L, 40L, 50L, 40L), V14 = structure(c(40L, 40L,
1L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L), .Label = c(" ?",
" Cambodia", " Canada", " China", " Columbia", " Cuba", " Dominican-Republic",
" Ecuador", " El-Salvador", " England", " France", " Germany",
" Greece", " Guatemala", " Haiti", " Holand-Netherlands",
" Honduras", " Hong", " Hungary", " India", " Iran", " Ireland",
" Italy", " Jamaica", " Japan", " Laos", " Mexico", " Nicaragua",
" Outlying-US(Guam-USVI-etc)", " Peru", " Philippines", " Poland",
" Portugal", " Puerto-Rico", " Scotland", " South", " Taiwan",
" Thailand", " Trinadad&Tobago", " United-States", " Vietnam",
" Yugoslavia"), class = "factor"), V15 = structure(c(1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L), .Label = c(" <=50K",
" >50K"), class = "factor")), row.names = 60:70, class = "data.frame")
I want to get rid of the rows that contain a `?` in any column.
I tried this code below, but I am not getting what I need from this. Can someone please help me?
# Asker's attempt, kept verbatim. apply(df, 1, ...) hands each row to the
# function as a character vector, and grepl() tests every cell for "?".
# NOTE: any(!grepl(...)) is TRUE as soon as a single cell in the row lacks
# "?", so the mask is TRUE for every row of df and no row is removed.
new_mtx <- apply (df, 1, function(x) any(!grepl("\\?", x)) )
df[new_mtx,]
Upvotes: 1
Views: 97
Reputation: 887038
We can use
library(dplyr)
library(stringr)
# Step 1: collect the rows that have a "?" in at least one column.
# Step 2: anti_join() then returns the rows of df that are not in that set.
df %>%
  filter_all(any_vars(str_detect(., "\\?"))) %>%
  anti_join(df, .)
Or with all_vars
# Keep a row only if str_detect() finds no "?" in any of its columns.
filter_all(df, all_vars(!str_detect(., "\\?")))
Or with base R
# For each column, flag the cells containing a literal "?" (fixed = TRUE, so
# no regex escaping is needed), OR the per-column flags together row by row,
# and keep the rows for which no column was flagged.
df[!Reduce(
  `|`,
  lapply(df, function(column) grepl("?", column, fixed = TRUE))
), ]
Upvotes: 1
Reputation: 6485
Solution using `dplyr`:
library(dplyr)
# Test every column except V1 and keep the rows in which none of the tested
# columns equals " ?" (the level is stored with a leading space).
df %>%
  filter_at(vars(-V1), all_vars(!(. %in% " ?")))
Returns:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
1 30 Private 188146 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 5013 0 40 United-States <=50K
2 30 Private 59496 Bachelors 13 Married-civ-spouse Sales Husband White Male 2407 0 40 United-States <=50K
3 48 Private 149640 HS-grad 9 Married-civ-spouse Transport-moving Husband White Male 0 0 40 United-States <=50K
4 42 Private 116632 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 45 United-States >50K
5 29 Private 105598 Some-college 10 Divorced Tech-support Not-in-family White Male 0 0 58 United-States <=50K
6 36 Private 155537 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K
7 28 Private 183175 Some-college 10 Divorced Adm-clerical Not-in-family White Female 0 0 40 United-States <=50K
8 53 Private 169846 HS-grad 9 Married-civ-spouse Adm-clerical Wife White Female 0 0 40 United-States >50K
9 49 Self-emp-inc 191681 Some-college 10 Married-civ-spouse Exec-managerial Husband White Male 0 0 50 United-States >50K
Upvotes: 1
Reputation: 21400
# Paste the cells of each row into one string, find the row positions whose
# string contains a literal "?", and keep all the other positions.
# setdiff() replaces df[-which(...), ]: when no row matches, which() returns
# integer(0), and negative indexing with it selects zero rows instead of all.
df[setdiff(
  seq_len(nrow(df)),
  which(grepl("\\?", apply(df, 1, paste0, collapse = " ")))
), ]
V1 V2 V3 V4 V5 V6 V7 V8 V9
60 30 Private 188146 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White
61 30 Private 59496 Bachelors 13 Married-civ-spouse Sales Husband White
63 48 Private 149640 HS-grad 9 Married-civ-spouse Transport-moving Husband White
64 42 Private 116632 Doctorate 16 Married-civ-spouse Prof-specialty Husband White
65 29 Private 105598 Some-college 10 Divorced Tech-support Not-in-family White
66 36 Private 155537 HS-grad 9 Married-civ-spouse Craft-repair Husband White
67 28 Private 183175 Some-college 10 Divorced Adm-clerical Not-in-family White
68 53 Private 169846 HS-grad 9 Married-civ-spouse Adm-clerical Wife White
69 49 Self-emp-inc 191681 Some-college 10 Married-civ-spouse Exec-managerial Husband White
V10 V11 V12 V13 V14 V15
60 Male 5013 0 40 United-States <=50K
61 Male 2407 0 40 United-States <=50K
63 Male 0 0 40 United-States <=50K
64 Male 0 0 45 United-States >50K
65 Male 0 0 58 United-States <=50K
66 Male 0 0 40 United-States <=50K
67 Female 0 0 40 United-States <=50K
68 Female 0 0 40 United-States >50K
69 Male 0 0 50 United-States >50K
Alternatively:
# Logical-mask variant: collapse each row into a single space-separated
# string and keep the rows whose string does not contain "?".
df[!grepl(
  "\\?",
  apply(df, 1, function(cells) paste(cells, collapse = " "))
), ]
Upvotes: 2