Moin Islam
Moin Islam

Reputation: 31

How to ignore beginning zeros in a row in R

I am thinking of calculating mean and std. dev. of every row in a dataset. However, I want to ignore the beginning zeros.

Row 1: 0 0 0 0 9 0 8 5 
Row 2: 0 0 3 5 6 0 0 0

I want to calculate my mean over [9 0 8 5] and [3 5 6 0 0 0]

Is there an easy way to do this with an R data frame?

Upvotes: 3

Views: 104

Answers (4)

nurandi
nurandi

Reputation: 1618

Try:

# Example data: two rows of eight values, each padded with leading zeros.
# The vector is deliberately not called `c`, so base::c() is never masked.
values <- c(
  0, 0, 0, 0, 9, 0, 8, 5,
  0, 0, 3, 5, 6, 0, 0, 0
)
df <- as.data.frame(matrix(values, nrow = 2, ncol = 8, byrow = TRUE))

# For every row, drop the leading zeros and compute mean / sd of the rest.
# Works for any number of rows and columns (nothing is hard-coded), and a row
# with no non-zero entry yields NA instead of failing on `NA:n`.
row_stats <- apply(as.matrix(df), 1, function(x) {
  # First position that is not a zero; NA when the whole row is zero.
  start <- match(FALSE, x %in% 0)
  if (is.na(start)) {
    return(c(Avg = NA_real_, Sd = NA_real_))
  }
  kept <- x[start:length(x)]
  c(Avg = mean(kept), Sd = sd(kept))
})

# row_stats has one column per input row and rows named "Avg" and "Sd".
df[["Avg"]] <- unname(row_stats["Avg", ])
df[["Sd"]] <- unname(row_stats["Sd", ])

# Clean up the temporaries so only `df` is left behind.
rm(values, row_stats)

df

#   V1 V2 V3 V4 V5 V6 V7 V8      Avg       Sd
# 1  0  0  0  0  9  0  8  5 5.500000 4.041452
# 2  0  0  3  5  6  0  0  0 2.333333 2.732520

Upvotes: 0

akrun
akrun

Reputation: 887691

Try

# Row means, skipping everything before the first strictly positive entry.
apply(df1, 1, function(row) {
  start <- Position(function(v) v > 0, row)
  mean(row[start:length(row)])
})
#[1] 5.500000 2.333333
# Row standard deviations over the same trimmed values.
apply(df1, 1, function(row) {
  start <- Position(function(v) v > 0, row)
  sd(row[start:length(row)])
})
#[1] 4.041452 2.732520

We could wrap this in a function

#' Row-wise summaries that ignore each row's leading zeros
#'
#' For every row of `dat`, everything before the first strictly positive,
#' non-missing value is dropped; each summary function is then applied to the
#' remainder with `na.rm = TRUE`.
#'
#' @param dat A data frame or matrix; it is coerced with `as.matrix()`.
#' @param ... One or more summary functions (e.g. `mean`, `sd`, `median`).
#'   Each must accept an `na.rm` argument and return a single number.
#' @return A numeric matrix with one row per row of `dat` and one column per
#'   function, the columns labelled with the expressions passed in `...`.
#'   Rows without any positive value yield `NA`.
f1 <- function(dat, ...) {
  # Take the functions as evaluated arguments rather than eval()-ing symbols,
  # so functions local to the caller and anonymous functions work too.
  funs <- list(...)
  if (length(funs) == 0L) {
    stop("supply at least one summary function", call. = FALSE)
  }

  # Column labels come from the unevaluated argument expressions.
  fun_exprs <- as.list(substitute(list(...)))[-1L]
  fun_names <- vapply(
    fun_exprs,
    function(e) paste(deparse(e), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )

  mat <- as.matrix(dat)

  summarise_row <- function(x, FUN) {
    start <- Position(function(y) !is.na(y) && y > 0, x)
    # No positive entry: nothing to summarise (and `NA:length(x)` would fail).
    if (is.na(start)) {
      return(NA_real_)
    }
    FUN(x[start:length(x)], na.rm = TRUE)
  }

  cols <- lapply(funs, function(FUN) {
    vapply(
      seq_len(nrow(mat)),
      function(i) summarise_row(mat[i, ], FUN),
      numeric(1)
    )
  })

  # Build the matrix explicitly so a one-row input still returns a matrix.
  matrix(
    unlist(cols, use.names = FALSE),
    nrow = nrow(mat),
    ncol = length(funs),
    dimnames = list(rownames(mat), fun_names)
  )
}

# A single summary function gives a one-column matrix, one row per data row.
f1(df1, mean)
#        mean
#[1,] 5.500000
#[2,] 2.333333
# Several functions give one column each, labelled by the function name.
f1(df1, mean, sd, median)
#        mean       sd median
#[1,] 5.500000 4.041452    6.5
#[2,] 2.333333 2.732520    1.5

# df2 has NA values both before and after the first positive entry.
f1(df2, mean, sd)
#       mean       sd
#[1,] 7.333333 2.081666
#[2,] 1.500000 3.000000

# df3 has an NA only after the first positive entry; the results match df2.
f1(df3, mean, sd)
#        mean       sd
#[1,] 7.333333 2.081666
#[2,] 1.500000 3.000000

data

# Integer-only example: two rows, each starting with a run of zeros.
df1 <- data.frame(
  v1 = c(0L, 0L), v2 = c(0L, 0L), v3 = c(0L, 3L), v4 = c(0L, 5L),
  v5 = c(9L, 6L), v6 = c(0L, 0L), v7 = c(8L, 0L), v8 = c(5L, 0L)
)

# Same layout with NA values in v3 and v6; v3 and v4 are double columns.
df2 <- data.frame(
  v1 = c(0L, 0L), v2 = c(0L, 0L), v3 = c(NA, 0), v4 = c(0, 0),
  v5 = c(9L, 6L), v6 = c(NA, 0L), v7 = c(8L, 0L), v8 = c(5L, 0L)
)

# Like df2, but the only NA is in v6.
df3 <- data.frame(
  v1 = c(0L, 0L), v2 = c(0L, 0L), v3 = c(0, 0), v4 = c(0, 0),
  v5 = c(9L, 6L), v6 = c(NA, 0L), v7 = c(8L, 0L), v8 = c(5L, 0L)
)

Upvotes: 3

thelatemail
thelatemail

Reputation: 93938

How about this, using the vectorized rowMeans function?

# Mask every cell to the left of each row's first non-zero entry with NA,
# then let rowMeans() skip the masked cells.
local({
  first_nonzero <- max.col(dat != 0, ties.method = "first")
  leading <- col(dat) < first_nonzero
  rowMeans(replace(dat, leading, NA), na.rm = TRUE)
})
#[1] 5.500000 2.333333

If speed is a concern over a large dataset, this will be much faster than using apply. If not, apply is definitely more readable.

Unfortunately, this method costs some flexibility, because not every summary statistic has a vectorized row-wise counterpart like rowMeans. There is, however, rowSds in the matrixStats package, which is also very quick:

library(matrixStats)
# The leading zeros are masked with NA, so na.rm = TRUE is needed for
# rowSds() to ignore them; with na.rm = FALSE every masked row would be NA.
rowSds(
  as.matrix(
    replace(dat, col(dat) < max.col(dat != 0, ties.method = "first"), NA)
  ),
  na.rm = TRUE
)
#[1] 4.041452 2.732520

Upvotes: 4

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193667

Maybe not the most elegant, but you can make use of cumsum in this case.

Try:

# cumsum(x != 0) counts the non-zero entries seen so far, so it stays at 0
# only across the leading zeros. Unlike cumsum(x) > 0, this keeps later
# values even when negative numbers pull the running sum to zero or below.
apply(mydf, 1, function(x) mean(x[cumsum(x != 0) > 0]))
# [1] 5.500000 2.333333

You can extend the idea by moving the function outside of apply so that you can customize the functions you want to add, like this:

#' Mean and standard deviation of a vector after dropping its leading zeros
#'
#' @param x A numeric vector (one row of the data).
#' @return A named numeric vector with elements `mean` and `sd`.
myFun <- function(x) {
  # Count the non-zero entries seen so far; the count is 0 only across the
  # leading zeros. Testing cumsum(x) > 0 instead would also drop later values
  # whenever negative numbers bring the running sum to zero or below.
  x <- x[cumsum(x != 0) > 0]
  c(mean = mean(x), sd = sd(x))
}

# Run myFun over the rows: the result has one column per data row and one
# row per statistic.
apply(mydf, MARGIN = 1, FUN = myFun)
#          [,1]     [,2]
# mean 5.500000 2.333333
# sd   4.041452 2.732520

Upvotes: 4

Related Questions