Reputation: 21
I would like to reshape a file but the typical use of reshape doesn't apply to my problem (I don't think). I could do this by hand but it would be error prone and I wouldn't learn how to use R to accomplish this. So after reading about Reshape and Tidyr, I decided to come here.
I have a file of electrical utilities and the counties in which they operate. I want to be able to reshape it so that it is a list of counties and what electrical utilities operate within them. I also have a variable for whether I have data for each utility in another dataset called "InData". Then I will determine on this basis what is the degree of missingness for each county.
The column names are:
UtilityName, InData, County1,County2,County3,County4... County12
Is there a simple way to reshape this or do I need to create some kind of loop going through each column?
Here is and example of the data:>
dput(utility_county)
structure(list(UtilityName = c("Alder Mutual Light Co Inc", "Avista Corporation",
"Asotin County PUD", "Benton County PUD", "Blaine City Light",
"Centralia City Light", "Chelan County PUD", "City of Cashmere",
"City of Chewelah, Electric Department", "City of Cheney", "City of Coulee Dam Light Dept",
"City of Ellensburg", "City of McCleary", "City of Milton", "City of Richland",
"City of Sumas", "Clallam County PUD", "Clark County PUD", "Clearwater Power",
"Columbia Rural Electric", "Cowlitz County PUD", "Douglas County PUD",
"Elmhurst Power & Light Co", "Ferry County PUD", "Franklin County PUD",
"Grant County PUD", "Grays Harbor County PUD", "Inland Power & Light",
"Jefferson County PUD", "Kittitas County PUD", "Klickitat County PUD",
"Kootenai Electric Cooperative Inc", "Lakeview Light & Power",
"Lewis County PUD", "Mason County PUD1", "Mason County PUD3",
"Modern Electric Water Company", "Nespelem Valley Electric Cooperative",
"Ohop Mutual Light Co", "Okanogan PUD", "Orcas Power and Light Coop",
"Pacific County PUD", "Pacific Power", "Parkland Light & Power",
"Pend Oreille PUD", "Peninsula Light Company", "Port Angeles City Light",
"Puget Sound Energy", "Seattle City Light", "Skamania PUD", "Snohomish County PUD",
"Tacoma Public Utilities", "Tanner Electric Cooperative", "Town of Eatonville",
"Town of Ruston", "Town of Steilacoom", "Vera Water & Power",
"Wahkiakum County PUD", "Whatcom County PUD", "Big Bend Electric",
"Northern Lights Inc"), InData = c(0L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L), UtilityType = c("Coop", "Private", "Public",
"Public", "Public", "Public", "Public", "Public", "Public", "Public",
"Public", "Public", "Public", "Public", "Public", "Public", "Public",
"Public", "Coop", "Coop", "Public", "Public", "Coop", "Public",
"Public", "Public", "Public", "Coop", "Public", "Public", "Public",
"Coop", "Coop", "Public", "Public", "Public", "Public", "Coop",
"Coop", "Public", "Coop", "Public", "Private", "Coop", "Public",
"Coop", "Public", "Private", "Public", "Public", "Public", "Public",
"Coop", "Public", "Public", "Public", "Coop", "Public", "Public",
"Coop", "Coop"), County1 = c("Pierce", "Whitman", "Asotin", "Benton",
"Whatcom", "Lewis", "Chelan", "Chelan", "Stevens", "Spokane",
"Grant", "Kittitas", "Grays Harbor", "Fulton County", "Benton County",
"Whatcom", "Clallam", "Clark", "Asotin ", "Walla Walla", "Cowlitz",
"Douglas", "Pierce", "Ferry", "Franklin", "Grant", "Grays Harbor",
"Spokane", "Jefferson", "Kittitas", "Klickitat", "Spokane", "Grant",
"Lewis", "Mason", "Mason", "Spokane", "Okanogan", "Pierce", "Okanogan",
"San Juan", "Pacific", "Yakima", "Pierce", "Pend Oreille", "Pierce",
"Clallam", "Island", "King", "Skamania", "Snohomish", "Pierce",
"King", "Pierce", "Pierce", "Pierce", "Spokane", "Wahkiakum",
"Whatcom", "Adams", "Pend Oreille"), County2 = c("", "Spokane",
"", "", "", "", "", "", "", "", "Douglas", "", "", "", "", "",
"", "", "Whitman", "Columbia", "", "", "", "Okanogan", "", "",
"", "Whitman", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "Garfield", "", "", "", "", "King", "", "", "Island",
"Mason", "Pierce", "", "", "", "", "", "", "", ""), County3 = c("",
"Stevens", "", "", "", "", "", "", "", "", "Okanogan", "", "",
"", "", "", "", "", "Garfield", "", "", "", "", "", "", "", "",
"Lincoln", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "Columbia", "", "", "", "", "Kitsap", "", "", "", "Grays Harbor",
"", "", "", "", "", "", "", "", ""), County4 = c("", "Lincoln",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "Stevens", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "Walla Walla", "", "",
"", "", "Kittitas", "", "", "", "Lewis", "", "", "", "", "",
"", "", "", ""), County5 = c("", "Asotin", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "Garfield", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "Benton", "", "", "", "", "Pierce", "", "",
"", "", "", "", "", "", "", "", "", "", ""), County6 = c("",
"Adams", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "Adams", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "Kittitas",
"", "", "", "", "Skagit", "", "", "", "", "", "", "", "", "",
"", "", "", ""), County7 = c("", "Ferry", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "Columbia", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "Thurston", "", "", "",
"", "", "", "", "", "", "", "", "", ""), County8 = c("", "Franklin",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "Pend Orielle", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"Whatcom", "", "", "", "", "", "", "", "", "", "", "", "", ""
), County9 = c("", "Grant", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"Grant", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", ""), County10 = c("", "Pend Oreille", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "Asotin", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "")), row.names = c(NA, 61L
), class = "data.frame")
Upvotes: 0
Views: 77
Reputation: 1081
utility_county_long =
reshape(utility_county,
direction = 'long',
varying = grep('County\\d+', names(utility_county), value = T),
timevar = 'county_id',
v.names = 'county_name',
idvar = grep('County\\d+', names(utility_county), value = T, invert=T))
rownames(utility_county_long) = NULL # reshape concatenates the idvars into rownames
# but we don't need them
# make empties explicit NAs
utility_county_long[utility_county_long$county_name == "", 'county_name'] = NA
# remove white spaces
utility_county_long = transform(utility_county_long,
UtilityName = trimws(UtilityName, 'both'),
county_name = trimws(county_name))
head(utility_county_long, 10)
counts = aggregate(list(N=utility_county_long$UtilityName),
list(county=utility_county_long$county_name),
length)
# extra: add pasted-together utility names as in @Ian Campbell's answer
counts$utilities = aggregate(list(utilities = utility_county_long$UtilityName),
list(county = utility_county_long$county_name),
\(x) paste(x, collapse = ", "))$utilities
counts[, c("county", "N")]
county N
1 Adams 3
2 Asotin 4
3 Benton 2
4 Benton County 1
5 Chelan 2
6 Clallam 2
7 Clark 1
8 Columbia 3
9 Cowlitz 1
10 Douglas 2
11 Ferry 2
12 Franklin 2
13 Fulton County 1
14 Garfield 3
15 Grant 5
16 Grays Harbor 3
17 Island 2
18 Jefferson 1
19 King 3
20 Kitsap 1
21 Kittitas 4
22 Klickitat 1
23 Lewis 3
24 Lincoln 2
25 Mason 3
26 Okanogan 4
27 Pacific 1
28 Pend Oreille 3
29 Pend Orielle 1
30 Pierce 11
31 San Juan 1
32 Skagit 1
33 Skamania 1
34 Snohomish 1
35 Spokane 6
36 Stevens 3
37 Thurston 1
38 Wahkiakum 1
39 Walla Walla 2
40 Whatcom 4
41 Whitman 3
42 Yakima 1
Upvotes: 0
Reputation: 24838
Here's an approach with tidyr
:
library(dplyr);library(tidyr)
utility_county %>%
pivot_longer(cols = -c(UtilityName:UtilityType),values_to = "County") %>%
filter(County != "") %>%
group_by(County) %>%
summarise(Utilities = paste(UtilityName, collapse = ", "),
Missing = n() - sum(InData))
## A tibble: 43 x 3
# County Utilities Missing
# <chr> <chr> <int>
# 1 "Adams" Avista Corporation, Inland Power & Light, Big Bend Electric 1
# 2 "Asotin" Avista Corporation, Asotin County PUD, Inland Power & Light 1
# 3 "Asotin " Clearwater Power 0
# 4 "Benton" Benton County PUD, Pacific Power 0
# 5 "Benton County" City of Richland 1
# 6 "Chelan" Chelan County PUD, City of Cashmere 2
# 7 "Clallam" Clallam County PUD, Port Angeles City Light 1
# 8 "Clark" Clark County PUD 0
# 9 "Columbia" Columbia Rural Electric, Inland Power & Light, Pacific Power 1
#10 "Cowlitz" Cowlitz County PUD 0
## … with 33 more rows
Note that your data has "Asotin "
, but you could fix that ahead of time if you wanted.
Upvotes: 1
Reputation: 73352
Using reshape
.
dat[] <- lapply(dat, \(x) {x[x == ''] <- NA;x}) ## encode missings first
res <- reshape(dat, varying=4:13, sep='', direction='long', timevar='County',
v.names='FOO')
head(res)
# tilityName InData UtilityType County FOO id
# 1.1 Alder Mutual Light Co Inc 0 Coop 1 Pierce 1
# 2.1 Avista Corporation 1 Private 1 Whitman 2
# 3.1 Asotin County PUD 0 Public 1 Asotin 3
# 4.1 Benton County PUD 1 Public 1 Benton 4
# 5.1 Blaine City Light 0 Public 1 Whatcom 5
# 6.1 Centralia City Light 0 Public 1 Lewis 6
dim(res)
# [1] 610 6
Upvotes: 0