Mark Miller
Mark Miller

Reputation: 13103

convert data frame of counts to proportions in R

I have a data frame of counts by region over time. One row of the data frame contains the count totals for each column. I want to convert the data frame from counts to proportions by dividing each column cell by the count total for the respective column. Some columns contain missing observations. I have done this below using nested for-loops but suspect there might be a much easier way, perhaps using lapply. I also had trouble extracting the row of count totals.

I am posting this partly because it is time I learn to use the apply family of functions and I suspect they might be useful here, and partly because I had so much trouble creating the vector of count totals and suspect using [[ would have been helpful. Thank you for any advice on writing the above code more efficiently.

my.data = read.table(text = "
state    y1970  y1980  y1990  y2000
Alaska       4      6     NA      7
Iowa        10     20     30     40
Nevada     100    100    100    100
Ohio        50     60     NA     80
total      172    195    215    238
Wyoming      8      9     10     11
", sep = "", header = TRUE)

desired.result = read.table(text = "
state         y1970       y1980       y1990       y2000
Alaska   0.02325581  0.03076923          NA  0.02941176  
Iowa     0.05813953  0.10256410  0.13953488  0.16806723  
Nevada   0.58139535  0.51282051  0.46511628  0.42016807  
Ohio     0.29069767  0.30769231          NA  0.33613445  
total    1.00000000  1.00000000  1.00000000  1.00000000  
Wyoming  0.04651163  0.04615385  0.04651163  0.04621849  
", sep = "", header = TRUE)

state  <- as.vector(unlist(my.data[, 1]))

my.totals <- as.vector(unlist(my.data[ my.data$state=='total', 2:5]))

proportions <- matrix(NA, nrow=nrow(my.data), ncol=ncol(my.data))
proportions <- as.data.frame(proportions)

for(i in 1:nrow(my.data)) {
 for(j in 1:ncol(my.data)) {

  if(j==1) proportions[i,1] <- state[i] 
  if(j> 1) proportions[i,j] <- my.data[i,j] / my.totals[j-1]

 }
}

colnames(proportions) <- names(my.data)
proportions


#     state      y1970      y1980      y1990      y2000
# 1  Alaska 0.02325581 0.03076923         NA 0.02941176
# 2    Iowa 0.05813953 0.10256410 0.13953488 0.16806723
# 3  Nevada 0.58139535 0.51282051 0.46511628 0.42016807
# 4    Ohio 0.29069767 0.30769231         NA 0.33613445
# 5   total 1.00000000 1.00000000 1.00000000 1.00000000
# 6 Wyoming 0.04651163 0.04615385 0.04651163 0.04621849

Upvotes: 5

Views: 13254

Answers (2)

HBat
HBat

Reputation: 5682

Alternatively you can:

library(tidyverse)

my.data = read.table(text = "
state    y1970  y1980  y1990  y2000
Alaska       4      6     NA      7
Iowa        10     20     30     40
Nevada     100    100    100    100
Ohio        50     60     NA     80
total      172    195    215    238
Wyoming      8      9     10     11
", sep = "", header = TRUE)

my.data %>% 
  # Convert table into long format
  pivot_longer(cols = -state, names_to = "year") %>% 
  # (Optional) Convert year to numeric:
  mutate(year = as.numeric(gsub("^y", "", year))) %>%  
  # Convert data frame to a table
  xtabs(formula = value ~ state + year) %>% 
  # Calculate proportions: 
  prop.table
#>          year
#> state            1970        1980        1990        2000
#>   Alaska  0.002555911 0.003833866 0.000000000 0.004472843
#>   Iowa    0.006389776 0.012779553 0.019169329 0.025559105
#>   Nevada  0.063897764 0.063897764 0.063897764 0.063897764
#>   Ohio    0.031948882 0.038338658 0.000000000 0.051118211
#>   total   0.109904153 0.124600639 0.137380192 0.152076677
#>   Wyoming 0.005111821 0.005750799 0.006389776 0.007028754

Upvotes: 0

IRTFM
IRTFM

Reputation: 263342

Probably something along these lines:

df[, -1] <- lapply( df[ , -1], function(x) x/sum(x, na.rm=TRUE) )

If it were a matrix you could have just used prop.table(mat). In this case however you need to limit to working only on the numeric columns (by excluding the first one).

Furthermore I think you need to exclude the "total" row:

 my.data[-5, -1] <- lapply( my.data[ -5 , -1], function(x){ x/sum(x, na.rm=TRUE)} )
 my.data[ -5 , ]
    state      y1970      y1980      y1990      y2000
1  Alaska 0.02325581 0.03076923         NA 0.02941176
2    Iowa 0.05813953 0.10256410 0.21428571 0.16806723
3  Nevada 0.58139535 0.51282051 0.71428571 0.42016807
4    Ohio 0.29069767 0.30769231         NA 0.33613445
6 Wyoming 0.04651163 0.04615385 0.07142857 0.04621849

-------------

Alternate approach:

> my.data[,-1] <-lapply( my.data[  , -1], function(x){ x/x[5] } )
> my.data
    state      y1970      y1980      y1990      y2000
1  Alaska 0.02325581 0.03076923         NA 0.02941176
2    Iowa 0.05813953 0.10256410 0.13953488 0.16806723
3  Nevada 0.58139535 0.51282051 0.46511628 0.42016807
4    Ohio 0.29069767 0.30769231         NA 0.33613445
5   total 1.00000000 1.00000000 1.00000000 1.00000000
6 Wyoming 0.04651163 0.04615385 0.04651163 0.04621849

This shows what prop.table will return with missing values when used on both margins and then on rows and columns separately for a very simple matrix:

> prop.table( matrix( c( 1,2,NA, 3),2) )
     [,1] [,2]
[1,]   NA   NA
[2,]   NA   NA
> prop.table( matrix( c( 1,2,NA, 3),2), 1 )
     [,1] [,2]
[1,]   NA   NA
[2,]  0.4  0.6
> prop.table( matrix( c( 1,2,NA, 3),2), 2 )
          [,1] [,2]
[1,] 0.3333333   NA
[2,] 0.6666667   NA

Upvotes: 4

Related Questions