Reputation: 663

Remove rows matching symbol/pattern in multiple columns in R

I have a data frame that looks like this:

# Reproducible sample data (dput output): 11 rows with row names 60-70 and
# 15 columns V1-V15. Integer columns: V1, V3, V5, V11, V12, V13; all other
# columns are factors whose level labels carry a leading space.
# The level " ?" is used in V2, V7 and V14; in this sample it occurs in the
# rows named 62 (V2, V7, V14) and 70 (V2, V7).
df <- structure(list(V1 = c(30L, 30L, 32L, 48L, 42L, 29L, 36L, 28L, 
53L, 49L, 25L), V2 = structure(c(5L, 5L, 1L, 5L, 5L, 5L, 5L, 
5L, 5L, 6L, 1L), .Label = c(" ?", " Federal-gov", " Local-gov", 
" Never-worked", " Private", " Self-emp-inc", " Self-emp-not-inc", 
" State-gov", " Without-pay"), class = "factor"), V3 = c(188146L, 
59496L, 293936L, 149640L, 116632L, 105598L, 155537L, 183175L, 
169846L, 191681L, 200681L), V4 = structure(c(12L, 10L, 6L, 12L, 
11L, 16L, 12L, 16L, 12L, 16L, 16L), .Label = c(" 10th", " 11th", 
" 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " Assoc-acdm", 
" Assoc-voc", " Bachelors", " Doctorate", " HS-grad", " Masters", 
" Preschool", " Prof-school", " Some-college"), class = "factor"), 
    V5 = c(9L, 13L, 4L, 9L, 16L, 10L, 9L, 10L, 9L, 10L, 10L), 
    V6 = structure(c(3L, 3L, 4L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 
    5L), .Label = c(" Divorced", " Married-AF-spouse", " Married-civ-spouse", 
    " Married-spouse-absent", " Never-married", " Separated", 
    " Widowed"), class = "factor"), V7 = structure(c(8L, 13L, 
    1L, 15L, 11L, 14L, 4L, 2L, 2L, 5L, 1L), .Label = c(" ?", 
    " Adm-clerical", " Armed-Forces", " Craft-repair", " Exec-managerial", 
    " Farming-fishing", " Handlers-cleaners", " Machine-op-inspct", 
    " Other-service", " Priv-house-serv", " Prof-specialty", 
    " Protective-serv", " Sales", " Tech-support", " Transport-moving"
    ), class = "factor"), V8 = structure(c(1L, 1L, 2L, 1L, 1L, 
    2L, 1L, 2L, 6L, 1L, 4L), .Label = c(" Husband", " Not-in-family", 
    " Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"), 
    V9 = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
    5L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander", 
    " Black", " Other", " White"), class = "factor"), V10 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c(" Female", 
    " Male"), class = "factor"), V11 = c(5013L, 2407L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), V12 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L), V13 = c(40L, 40L, 40L, 40L, 45L, 
    58L, 40L, 40L, 40L, 50L, 40L), V14 = structure(c(40L, 40L, 
    1L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L), .Label = c(" ?", 
    " Cambodia", " Canada", " China", " Columbia", " Cuba", " Dominican-Republic", 
    " Ecuador", " El-Salvador", " England", " France", " Germany", 
    " Greece", " Guatemala", " Haiti", " Holand-Netherlands", 
    " Honduras", " Hong", " Hungary", " India", " Iran", " Ireland", 
    " Italy", " Jamaica", " Japan", " Laos", " Mexico", " Nicaragua", 
    " Outlying-US(Guam-USVI-etc)", " Peru", " Philippines", " Poland", 
    " Portugal", " Puerto-Rico", " Scotland", " South", " Taiwan", 
    " Thailand", " Trinadad&Tobago", " United-States", " Vietnam", 
    " Yugoslavia"), class = "factor"), V15 = structure(c(1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L), .Label = c(" <=50K", 
    " >50K"), class = "factor")), row.names = 60:70, class = "data.frame")

I want to remove every row that contains a `?` in any column. I tried the code below (using the regex `\\?` to match a literal question mark), but it does not give me the result I need. Can someone please help me?

# Attempt from the question: build one logical flag per row, then subset df.
# apply(df, 1, ...) passes each row to the function as a character vector.
# NOTE(review): any(!grepl("\\?", x)) is TRUE as soon as a single cell in the
# row lacks "?", which holds for every row of this df, so no row is dropped.
# The intended test is "no cell contains ?", i.e. !any(grepl("\\?", x)).
new_mtx  <- apply (df, 1, function(x) any(!grepl("\\?", x)) )
df[new_mtx,]

Upvotes: 1

Answers (3)

akrun

Reputation: 887038

We can use

library(dplyr)
library(stringr)
# Step 1: collect the rows in which at least one column contains a literal "?".
rows_with_qmark <- df %>%
  filter_all(any_vars(str_detect(., "\\?")))
# Step 2: keep only the rows of df that have no match among those rows.
anti_join(df, rows_with_qmark)

Or with all_vars

# Keep a row only when every column is free of a literal "?".
filter_all(df, all_vars(!str_detect(., "\\?")))

Or with base R

# One logical vector per column marking the cells that contain a literal "?"
# (fixed = TRUE, so "?" is not treated as a regex quantifier).
qmark_by_col <- lapply(df, function(col) grepl("?", col, fixed = TRUE))
# OR the per-column vectors together to flag rows with a "?" in any column,
# then keep the rows that are not flagged.
df[!Reduce(`|`, qmark_by_col), ]

Upvotes: 1

dario

Reputation: 6485

Solution using dplyr:

  library(dplyr)
  # Look at every column except V1 and keep the rows in which none of them
  # holds exactly " ?" (the level label includes the leading space).
  filter_at(df, vars(-V1), all_vars(!(. %in% c(" ?"))))

Returns:

    V1            V2     V3            V4 V5                  V6                 V7             V8     V9     V10  V11 V12 V13            V14    V15
  1 30       Private 188146       HS-grad  9  Married-civ-spouse  Machine-op-inspct        Husband  White    Male 5013   0  40  United-States  <=50K
  2 30       Private  59496     Bachelors 13  Married-civ-spouse              Sales        Husband  White    Male 2407   0  40  United-States  <=50K
  3 48       Private 149640       HS-grad  9  Married-civ-spouse   Transport-moving        Husband  White    Male    0   0  40  United-States  <=50K
  4 42       Private 116632     Doctorate 16  Married-civ-spouse     Prof-specialty        Husband  White    Male    0   0  45  United-States   >50K
  5 29       Private 105598  Some-college 10            Divorced       Tech-support  Not-in-family  White    Male    0   0  58  United-States  <=50K
  6 36       Private 155537       HS-grad  9  Married-civ-spouse       Craft-repair        Husband  White    Male    0   0  40  United-States  <=50K
  7 28       Private 183175  Some-college 10            Divorced       Adm-clerical  Not-in-family  White  Female    0   0  40  United-States  <=50K
  8 53       Private 169846       HS-grad  9  Married-civ-spouse       Adm-clerical           Wife  White  Female    0   0  40  United-States   >50K
  9 49  Self-emp-inc 191681  Some-college 10  Married-civ-spouse    Exec-managerial        Husband  White    Male    0   0  50  United-States   >50K

Upvotes: 1

Chris Ruehlemann

Reputation: 21400

# Paste the cells of each row into one string so a single search covers all
# columns.
row_text <- apply(df, 1, paste0, collapse = " ")
# grep(..., invert = TRUE) returns the positions of the rows WITHOUT a "?".
# Unlike df[-which(grepl(...)), ], this is still correct when no row matches:
# which() would then return integer(0), and df[-integer(0), ] selects zero
# rows instead of all of them.
df[grep("\\?", row_text, invert = TRUE), ]
   V1            V2     V3            V4 V5                  V6                 V7             V8     V9
60 30       Private 188146       HS-grad  9  Married-civ-spouse  Machine-op-inspct        Husband  White
61 30       Private  59496     Bachelors 13  Married-civ-spouse              Sales        Husband  White
63 48       Private 149640       HS-grad  9  Married-civ-spouse   Transport-moving        Husband  White
64 42       Private 116632     Doctorate 16  Married-civ-spouse     Prof-specialty        Husband  White
65 29       Private 105598  Some-college 10            Divorced       Tech-support  Not-in-family  White
66 36       Private 155537       HS-grad  9  Married-civ-spouse       Craft-repair        Husband  White
67 28       Private 183175  Some-college 10            Divorced       Adm-clerical  Not-in-family  White
68 53       Private 169846       HS-grad  9  Married-civ-spouse       Adm-clerical           Wife  White
69 49  Self-emp-inc 191681  Some-college 10  Married-civ-spouse    Exec-managerial        Husband  White
       V10  V11 V12 V13            V14    V15
60    Male 5013   0  40  United-States  <=50K
61    Male 2407   0  40  United-States  <=50K
63    Male    0   0  40  United-States  <=50K
64    Male    0   0  45  United-States   >50K
65    Male    0   0  58  United-States  <=50K
66    Male    0   0  40  United-States  <=50K
67  Female    0   0  40  United-States  <=50K
68  Female    0   0  40  United-States   >50K
69    Male    0   0  50  United-States   >50K

Alternatively:

# Logical mask: TRUE for rows whose pasted-together cells contain a "?".
has_qmark <- grepl("\\?", apply(df, 1, paste0, collapse = " "))
# Keep the rows that are not flagged.
df[!has_qmark, ]

Upvotes: 2

Remove rows matching symbol/pattern in multiple columns in R

Answers (3)

Related Questions