Reputation: 99
I am having trouble using the filter verb. Below is a small sample of my dataset.
structure(list(employer = c("MICROSOFT CORPORATION", "GOOGLE INC",
"GOOGLE INC", "AMAZON CORPORATE LLC", "MICROSOFT CORPORATION",
"GOOGLE INC", "AMAZON CORPORATE LLC", "MICROSOFT CORPORATION",
"MICROSOFT CORPORATION", "MICROSOFT CORPORATION", "AMAZON CORPORATE LLC",
"APPLE INC", "AMAZON CORPORATE LLC", "AMAZON CORPORATE LLC",
"YAHOO HOLDINGS INC", "APPLE INC", "AMAZON CORPORATE LLC", "GOOGLE INC",
"AMAZON WEB SERVICES INC", "GOOGLE INC", "AMAZONCOMKYDC LLC",
"LINKEDIN CORPORATION", "FACEBOOK INC", "GOOGLE INC", "GOOGLE INC",
"GOOGLE INC", "AMAZON CORPORATE LLC", "AMAZON CORPORATE LLC",
"MICROSOFT CORPORATION", "GOOGLE INC", "GOOGLE INC", "AMAZON CORPORATE LLC",
"AIRBNB INC", "MICROSOFT CORPORATION", "GOOGLE INC", "GOOGLE INC",
"GOOGLE INC", "AMAZON CORPORATE LLC", "GOOGLE INC", "YAHOO! INC",
"AMAZON CORPORATE LLC", "MICROSOFT CORPORATION", "MICROSOFT CORPORATION",
"GOOGLE INC", "FACEBOOK INC", "AIRBNB INC", "MICROSOFT CORPORATION",
"APPLE INC", "UBER TECHNOLOGIES INC", "MICROSOFT CORPORATION"
), job.title = c("SOFTWARE ENGINEER", "STRATEGIST", "TEST ENGINEER",
"TECHNICAL PROGRAM MANAGER", "PROGRAM MANAGER", "SOFTWARE ENGINEER, SITE RELIABILITY ENGINEERING",
"SOFTWARE DEVELOPMENT ENGINEER II", "SENIOR SOFTWARE ENGINEER",
"SOFTWARE ENGINEER 2", "SENIOR SOFTWARE ENGINEER", "SENIOR PRODUCT MANAGER",
"ENGINEERING PROJECT MGR 4", "PROGRAM MANAGER", "BUSINESS INTELLIGENCE ENGINEER I",
"TECH YAHOO, SOFTWARE DEV ENGINEER", "SOFTWARE ENGINEER APPS",
"SOFTWARE DEVELOPMENT ENGINEER I", "SOFTWARE ENGINEER", "SECURITY ENGINEER II",
"SOFTWARE ENGINEER", "OPERATIONS MANAGER", "SOFTWARE ENGINEER",
"SOFTWARE ENGINEER", "SOFTWARE ENGINEER", "TECHNICAL ACCOUNT MANAGER",
"ANALYTICAL LEAD", "PRODUCT MANAGER II", "SOFTWARE DEVELOPMENT ENGINEER II",
"SENIOR PROGRAM MANAGER", "SOFTWARE ENGINEER", "SOFTWARE ENGINEER",
"SOFTWARE DEVELOPMENT ENGINEER III", "SOFTWARE ENGINEER", "PROGRAM MANAGER",
"SALES STRATEGY ASSOCIATE", "SOFTWARE ENGINEER", "SOFTWARE ENGINEER 1615.20269",
"SOFTWARE DEVELOPMENT ENGINEER II", "SOFTWARE ENGINEER", "TECH YAHOO, SOFTWARE DEVELOPMENT ENGINEER, ASSOCIATE",
"NETWORK DEVELOPMENT ENGINEER I", "SOFTWARE DEVELOPMENT ENGINEER IN TEST",
"SENIOR SOFTWARE ENGINEERING MANAGER", "SOLUTIONS CONSULTANT",
"DATA SCIENTIST", "SOFTWARE ENGINEER", "SUPPORT ENGINEER", "SYSTEMS DESIGN ENGINEER 3",
"SOFTWARE ENGINEER", "PREMIER FIELD ENGINEER"), base.salary = c("125,003",
"110,000", "125,100", "155,000", "117,218", "104,000", "120,700",
"145,301", "140,000", "141,123", "115,000", "137,571", "105,500",
"93,000", "123,628", "150,000", "99,200", "108,000", "135,000",
"110,000", "90,000", "131,997", "110,000", "115,000", "108,000",
"91,000", "110,000", "144,000", "160,250", "127,000", "132,000",
"153,900", "125,000", "124,989", "110,200", "150,000", "132,000",
"112,000", "120,000", "96,866", "105,000", "94,139", "156,123",
"97,500", "117,453", "120,000", "92,500", "97,386", "111,405",
"109,811"), location = c("BELLEVUE, WA", "MOUNTAIN VIEW, CA",
"MOUNTAIN VIEW, CA", "SEATTLE, WA", "REDMOND, WA", "VENICE, CA",
"SEATTLE, WA", "REDMOND, WA", "SAN FRANCISCO, CA", "REDMOND, WA",
"SEATTLE, WA", "CUPERTINO, CA", "SEATTLE, WA", "SEATTLE, WA",
"SUNNYVALE, CA", "CUPERTINO, CA", "SEATTLE, WA", "MOUNTAIN VIEW, CA",
"SEATTLE, WA", "MOUNTAIN VIEW, CA", "ORLANDO, FL", "NEW YORK, NY",
"MENLO PARK, CA", "PITTSBURGH, PA", "MOUNTAIN VIEW, CA", "NEW YORK, NY",
"SEATTLE, WA", "SEATTLE, WA", "REDMOND, WA", "MOUNTAIN VIEW, CA",
"MOUNTAIN VIEW, CA", "SEATTLE, WA", "SAN FRANCISCO, CA", "REDMOND, WA",
"MOUNTAIN VIEW, CA", "PALO ALTO, CA", "MOUNTAIN VIEW, CA", "SEATTLE, WA",
"KIRKLAND, WA", "SAN FRANCISCO, CA", "SEATTLE, WA", "ISSAQUAH, WA",
"REDMOND, WA", "NEW YORK, NY", "MENLO PARK, CA", "SAN FRANCISCO, CA",
"SEATTLE, WA", "CUPERTINO, CA", "NEW YORK, NY", "BENTONVILLE, AR"
), submit.date = c("12/27/2016", "06/08/2016", "06/02/2016",
"05/22/2017", "11/04/2014", "02/25/2016", "02/27/2014", "11/13/2014",
"06/15/2017", "11/20/2014", "02/04/2017", "06/15/2017", "02/24/2017",
"06/19/2015", "02/17/2017", "11/04/2016", "01/13/2017", "05/15/2015",
"02/04/2014", "11/08/2013", "03/16/2017", "11/18/2016", "01/08/2014",
"05/07/2014", "10/22/2013", "02/16/2017", "08/21/2015", "04/29/2016",
"08/25/2016", "02/18/2015", "03/17/2016", "06/14/2017", "02/12/2015",
"10/01/2015", "02/27/2015", "12/14/2015", "02/09/2017", "03/09/2015",
"05/12/2016", "03/03/2016", "06/11/2014", "12/06/2013", "01/19/2015",
"02/22/2016", "02/10/2015", "02/18/2017", "03/17/2017", "06/18/2014",
"07/25/2016", "11/16/2015"), start.date = c("06/26/2017", "10/01/2016",
"10/22/2016", "06/05/2017", "11/17/2014", "08/23/2016", "08/25/2014",
"05/11/2015", "06/28/2017", "05/16/2015", "07/30/2017", "10/28/2017",
"08/04/2017", "07/20/2015", "03/01/2017", "11/21/2016", "07/14/2017",
"09/08/2015", "02/07/2014", "01/06/2014", "03/27/2017", "12/05/2016",
"07/04/2014", "10/03/2014", "11/04/2013", "08/18/2017", "09/14/2015",
"10/23/2016", "10/01/2016", "08/17/2015", "03/24/2016", "11/14/2017",
"08/01/2015", "04/01/2016", "08/21/2015", "01/25/2016", "07/21/2017",
"08/30/2015", "08/12/2016", "09/01/2016", "06/18/2014", "06/04/2014",
"06/11/2015", "08/20/2016", "08/07/2015", "08/01/2017", "09/15/2017",
"09/02/2014", "07/28/2016", "11/23/2015"), case.status = c("CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"WITHDRAWN", "CERTIFIED", "CERTIFIED", "CERTIFIED", "DENIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "WITHDRAWN", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED",
"CERTIFIED", "CERTIFIED", "CERTIFIED", "CERTIFIED")), .Names = c("employer",
"job.title", "base.salary", "location", "submit.date", "start.date",
"case.status"), row.names = c(49523L, 34286L, 34811L, 11521L,
39957L, 33899L, 8005L, 43122L, 51506L, 42828L, 4681L, 13148L,
3377L, 904L, 56070L, 15872L, 6070L, 25408L, 4268L, 25972L, 2556L,
36551L, 19938L, 26637L, 34433L, 21937L, 3178L, 9001L, 41880L,
27560L, 28258L, 9576L, 227L, 40098L, 24335L, 29791L, 31987L,
7452L, 26970L, 56520L, 2391L, 45909L, 44112L, 34167L, 18377L,
171L, 51780L, 17635L, 54413L, 39161L), class = "data.frame")
I had to clean the original dataset because when I looked at unique employers (of the original dataset) I had a lot of extra data:
unique(cleanH1B$employer)
[1] "AIRBNB INC"
[2] "AMAZON CORPORATE LLC"
[3] "AMAZON MEDIA GROUP LLC"
[4] "AMAZON MEDIA GROUP"
[5] "AMAZON WEB SERVICES INC"
[6] "AMAZON SERVICES LLC"
[7] "AMAZONCOMKYDC LLC"
[8] "AMAZON FULFILLMENT SERVICES INC"
[9] "AMAZONCOMAZDC LLC"
[10] "AMAZON PRODUCE NETWORK LLP"
[11] "AMAZONCOMDEDC LLC"
[12] "AMAZONCOMKSDC LLC"
[13] "AMAZON CAPITAL SERVICES INC"
[14] "AMAZON DIGITAL SERVICES INC"
[15] "AMAZON DIGITAL SERVICES LLC"
[16] "AMAZON WEB SERVICES LLC"
[17] "AMAZON COPORATE LLC"
[18] "AMAZON ROBOTICS LLC"
[19] "AMAZON WEB SERVICES"
[20] "AMAZON STUDIOS INC"
[21] "AMAZON MASONRY"
[22] "AMAZON PAYMENTS INC"
[23] "AMAZONCOM DEDC LLC"
[24] "AMAZONCOMINDC LLC"
[25] "AMAZONCOMKYDC"
[26] "AMAZON CORPORATE"
[27] "AMAZON STUDIOS LLC"
[28] "AMAZONCOMNVDC INC"
[29] "AMAZONCOMKYDC INC"
[30] "AMAZON FRESH LLC"
[31] "AMAZONCOMDEDCLLC"
[32] "AMAZON PHARMACY INC"
[33] "AMAZON WEB SEERVICES INC"
[34] "AMAZONCOM AZDZ LLC"
[35] "AMAZON FULFILLMENT SERVICE INC"
[36] "AMAZON FUFILLMENT SERVICES INC"
[37] "AMAZON REGISTRY SERVICES INC"
[38] "AMAZON TECHNOLOGIES INC"
[39] "AMAZON DEVELOPMENT CENTER INC"
[40] "AMAZON RESTAURANT & BAR INC"
[41] "AMAZON CORP LLC"
[42] "AMAZON FULFILLMENT SVCS INC"
[43] "AMAZON MECHANICAL TURK INC"
[44] "AMAZON CORPORTATE LLC"
[45] "AMAZON CAPTAL SERVICES INC"
[46] "AMAZON ROBOTICS LLC (KIVA)"
[47] "AMAZON CORPORTE LLC"
[48] "APPLET SYSTEMS LLC"
[49] "APPLE INC"
[50] "APPLE ALUM USA CORP"
[51] "APPLE FEDERAL CREDIT UNION"
[52] "APPLE DENTAL & ASSOCIATES LLC"
[53] "APPLE AMERICAN GROUP"
[54] "APPLETON GRP LLC"
[55] "APPLEXUS TECHNOLOGIES LLC"
[56] "APPLE AMERICAN GROUP LLC"
[57] "APPLE TREE DENTAL"
[58] "APPLE T USA INC"
[59] "APPLE AIR COMPRESSOR CORP"
[60] "APPLE BEAUTY INC"
[61] "APPLEGATE TRAN INTERIORS INC"
[62] "APPLE MEDICAL CENTER AND URGENT CARE"
[63] "APPLE MEDICAL CENTER AND URGENT CARE INC"
[64] "APPLESEED MONTESSORI SCHOOL"
[65] "APPLEPEA MONTESSORI ACADEMY OF ONTARIO"
[66] "APPLE SEEDS LLC"
[67] "APPLETREE INSTITUTE FOR EDUCATION INNOVATION INC"
[68] "APPLETREE DAY CARE CENTER INC"
[69] "APPLETREE EARLY LEARNING PUBLIC CHARTER SCHOOL"
[70] "APPLECHEM INC"
[71] "APPLEECON LLC"
[72] "APPLECON LLC"
[73] "APPLECRATE INC"
[74] "APPLEBY CAPITAL INC"
[75] "APPLE VACATIONS LLC"
[76] "FACEBOOK INC"
[77] "FACEBOOK MIAMI INC"
[78] "FACEBOOK SERVICES INC"
[79] "GOOGLE INC"
[80] "GOOGLE LIFE SCIENCES LLC"
[81] "GOOGLE CAPITAL MANAGEMENT COMPANY LLC"
[82] "GOOGLE VENTURES MANAGEMENT COMPANY LLC"
[83] "LINKEDIN CORPORATION"
[84] "MICROSOFT CORPORATION"
[85] "MICROSOFT OPERATIONS PUERTO RICO LLC"
[86] "TWITTER INC"
[87] "UBER TECHNOLOGIES INC"
[88] "UBERION INC"
[89] "UBERTAL INC"
[90] "UBERWURX LLC"
[91] "UBERTEJAS LLC"
[92] "UBERMEDIA INC"
[93] "UBER OPERATIONS LLC"
[94] "UBERLEGEN TECHNOLOGY GROUP LLC"
[95] "UBER BRAIN LLC"
[96] "UBERTO CONSTRUCTION"
[97] "UBER GROUP PLLC"
[98] "YAHOO! INC"
[99] "YAHOO INC"
[100] "YAHOO HOLDINGS INC"
[101] "YAHOO HOLDINGS"
My methodology was to identify all the unnecessary employer names and remove them with filter(). Below is my attempt to create a new dataframe by removing three rows associated with the following employers.
# Attempt 1: the three != tests are joined with | (OR), so a row is kept when
# its employer differs from at least one of the listed names. A single value
# cannot equal all three names at once, so the condition is TRUE for every
# non-missing employer and no rows are removed.
cleanH1B <- filter(df_h1b, employer != "AMAZON MASONRY" |
employer != "AMAZON AERO SERVICES LLC" |
employer != "APPLET SYSTEMS LLC")
However, my method does not work and I end up with the same original dataset. I also tried:
# Attempt 2: each condition is passed to filter() as a separate argument;
# dplyr combines separate arguments with & (AND), so a row is kept only when
# its employer differs from all three names.
cleanH1B <- filter(df_h1b, employer != "AMAZON MASONRY",
employer != "AMAZON AERO SERVICES LLC",
employer != "APPLET SYSTEMS LLC")
Can anyone help point out how I am using filter() wrong and possibly share a more effective technique?
Thank you!
Upvotes: 0
Views: 835
Reputation: 6264
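Change your filter to use AND (&) instead of OR (|). With OR, every row passes: an employer can never equal all three names at once, so at least one of the != tests is always TRUE and nothing gets removed.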
# Keep only the rows whose employer differs from all three excluded names
# (the != tests are joined with &, i.e. AND).
# NOTE: `df` presumably holds the sample data from the question -- substitute
# your own data frame (df_h1b in the question).
df %>% filter(employer != "AMAZON MASONRY" &
employer != "AMAZON AERO SERVICES LLC" &
employer != "APPLET SYSTEMS LLC")
# # A tibble: 50 x 7
# employer job.title base.salary
# <chr> <chr> <chr>
# 1 MICROSOFT CORPORATION SOFTWARE ENGINEER 125,003
# 2 GOOGLE INC STRATEGIST 110,000
# 3 GOOGLE INC TEST ENGINEER 125,100
# 4 AMAZON CORPORATE LLC TECHNICAL PROGRAM MANAGER 155,000
# 5 MICROSOFT CORPORATION PROGRAM MANAGER 117,218
# 6 GOOGLE INC SOFTWARE ENGINEER, SITE RELIABILITY ENGINEERING 104,000
# 7 AMAZON CORPORATE LLC SOFTWARE DEVELOPMENT ENGINEER II 120,700
# 8 MICROSOFT CORPORATION SENIOR SOFTWARE ENGINEER 145,301
# 9 MICROSOFT CORPORATION SOFTWARE ENGINEER 2 140,000
# 10 MICROSOFT CORPORATION SENIOR SOFTWARE ENGINEER 141,123
...if you want to simplify even further, you can assign your exclusions to a vector first.
# Collect the employer names to exclude in a single character vector.
emp_list <- c("AMAZON MASONRY", "AMAZON AERO SERVICES LLC", "APPLET SYSTEMS LLC")
# `!` has lower precedence than %in%, so this reads !(employer %in% emp_list):
# keep the rows whose employer is not in the exclusion list.
# %in% never returns NA, so rows with a missing employer are kept here,
# whereas the != version drops them (filter() discards rows that evaluate to NA).
df %>% filter(!employer %in% emp_list)
That way your filter is more straightforward.
Upvotes: 1