rustymarmot
rustymarmot

Reputation: 121

R large data frame random sub sample based on grid

I have a very large data frame (7000 columns and 14000 observations). They are in fact greyscale observations of each pixel of an image: 7000px on the x-axis and 14000px on the y-axis. I'm looking for a way to do the following:

Any ideas on how I might do this would be greatly appreciated

Upvotes: 2

Views: 224

Answers (4)

rustymarmot
rustymarmot

Reputation: 121

I'm sure someone better than me has a more elegant solution, but this is what I ended up doing to randomly sample each 1000 by 1000 grid in a matrix that was 7000 by 14000. It's long-winded but it works.

Setting a min and max value for the random number (1 and 1000) and working row by row to randomly select a row value and a column value and plucking that number out of the matrix.

It does this seven times for the first row (7000/1000). Then it adds 1000 to the min and max values for the second row. Each time the values are plucked they are stored to "g" and attached to df_new using rbind.

This repeats until I get to row 14, then there is some tidying up of the data and renaming.

The end is a data frame of 98 values along with the grid number they came from and its x and y coordinates in the matrix

Not very flexible and fixed for matrix 7000 by 14000 and grids that are 1000 by 1000. But I don't need to change this..... yet !!!

My wish list down the track would be a function that lets me set..

  • the grid size (x and y) so i can have rectangular grids
  • the number of samples plucked from each grid ( 1, 2, 3, or more)

Thanks all for your input. Much appreciated.

# BUILD A REPRODUCIBLE DUMMY IMAGE: 14000 ROWS BY 7000 COLUMNS OF
# UNIFORM RANDOM VALUES, HELD AS A DATA FRAME
set.seed(seed = 123)
df <- data.frame(
  matrix(data = runif(n = 14000 * 7000), nrow = 14000)
)

# PICK RANDOM VALUES FROM EACH CELL OF A GRID LAID OVER THE DATA
# CELLS ARE NUMBERED ALONG THE TOP ROW OF CELLS FIRST (CELLS 1 TO 7),
# THEN THE NEXT ROW (CELLS 8 TO 14) ETC ETC
# UNTIL CELL 98

#' Randomly sample values from every cell of a regular grid
#'
#' Lays a grid of `cell_rows` x `cell_cols` cells over a two-dimensional
#' table and draws `n_samples` distinct positions from each cell, every
#' position in a cell being equally likely. Cells are numbered left to right
#' and then top to bottom. When the table size is not a multiple of the cell
#' size, the cells on the right and bottom edges are smaller.
#'
#' @param data A data frame or matrix holding numeric values.
#' @param cell_rows Height of one grid cell, in rows of `data`.
#' @param cell_cols Width of one grid cell, in columns of `data`.
#' @param n_samples Number of distinct positions to draw from each cell.
#' @return A data frame with one row per sampled value and the columns
#'   `grid` (cell number), `x` (column index in `data`), `y` (row index in
#'   `data`) and `value` (the value found at that position).
sample_grid <- function(data, cell_rows = 1000L, cell_cols = 1000L,
                        n_samples = 1L) {
  is_count <- function(v) {
    is.numeric(v) && length(v) == 1L && !is.na(v) && v >= 1 && v %% 1 == 0
  }
  stopifnot(
    length(dim(data)) == 2L,
    is_count(cell_rows),
    is_count(cell_cols),
    is_count(n_samples)
  )

  n_row <- nrow(data)
  n_col <- ncol(data)
  cells_across <- ceiling(n_col / cell_cols)
  n_cells <- ceiling(n_row / cell_rows) * cells_across

  # An empty table has no cells: return the same columns with zero rows.
  if (n_cells == 0) {
    return(data.frame(
      grid = integer(0),
      x = integer(0),
      y = integer(0),
      value = numeric(0)
    ))
  }

  # First row / column and actual size of every cell.
  cell <- seq_len(n_cells)
  row_start <- ((cell - 1L) %/% cells_across) * cell_rows + 1
  col_start <- ((cell - 1L) %% cells_across) * cell_cols + 1
  height <- pmin(cell_rows, n_row - row_start + 1)
  width <- pmin(cell_cols, n_col - col_start + 1)
  cell_size <- height * width

  if (n_samples > min(cell_size)) {
    stop(
      "`n_samples` is larger than the smallest grid cell (",
      min(cell_size), " values).",
      call. = FALSE
    )
  }

  # sample.int() gives every position the same chance. Rounding a uniform
  # draw, round(runif(1, a, b)), selects a and b only half as often as the
  # values in between.
  pos <- unlist(lapply(cell_size, function(n) sample.int(n, n_samples)))

  # Positions run down the columns of a cell; turn them back into row and
  # column indices of the full table.
  owner <- rep(cell, each = n_samples)
  rows <- row_start[owner] + (pos - 1L) %% height[owner]
  cols <- col_start[owner] + (pos - 1L) %/% height[owner]

  if (is.data.frame(data)) {
    # Read each value from its own column: indexing a data frame with a
    # matrix of positions converts the whole table to a matrix first.
    values <- vapply(
      seq_along(rows),
      function(i) data[[cols[i]]][rows[i]],
      numeric(1)
    )
  } else {
    values <- data[cbind(rows, cols)]
  }

  data.frame(
    grid = owner,
    x = as.integer(cols),
    y = as.integer(rows),
    value = values
  )
}

set.seed(42)

# ONE VALUE FROM EACH 1000 BY 1000 CELL: 98 ROWS WITH THE CELL NUMBER,
# THE X (COLUMN) AND Y (ROW) POSITION IN df AND THE VALUE FOUND THERE
df_new <- sample_grid(df, cell_rows = 1000, cell_cols = 1000, n_samples = 1)

Upvotes: 0

ThomasIsCoding
ThomasIsCoding

Reputation: 102700

If you want to have block matrix, you can try the code below

# Cut `df` into blocks of up to 1000 rows by 1000 columns and arrange them
# in a list-matrix, so that blks[[i, j]] is the matrix for the i-th band of
# rows and the j-th band of columns.
# The rows of the grid are stacked with rbind() instead of t(sapply(...)):
# sapply() returns a plain list rather than a matrix when there is only one
# band of columns, which loses the two-dimensional layout.
blks <- do.call(
  rbind,
  lapply(
    # seq_len() stays empty for a data frame without rows; seq(nrow(df))
    # would give c(1, 0) instead
    split(df, ceiling(seq_len(nrow(df)) / 1000)),
    function(band) {
      lapply(
        split.default(band, ceiling(seq_along(band) / 1000)),
        as.matrix
      )
    }
  )
)

and you will see

> blks
  1               2               3               4
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
  5               6               7               8
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
  9               10              11              12
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
  13              14
1 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000

> dim(blks[[2,3]])
[1] 1000 1000

> str(blks[[2,3]])
 num [1:1000, 1:1000] 0.909 0.833 0.347 0.837 0.58 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:1000] "1001" "1002" "1003" "1004" ...
  ..$ : chr [1:1000] "X2001" "X2002" "X2003" "X2004" ...

Data

# Example data: 7000 rows by 14000 columns of uniform random numbers
set.seed(seed = 123)
df <- data.frame(
  matrix(data = runif(n = 14000 * 7000), nrow = 7000, ncol = 14000)
)

Upvotes: 0

AnilGoyal
AnilGoyal

Reputation: 26238

Follow this strategy

# Sample data frame: 14000 rows by 7000 columns of uniform random numbers
set.seed(seed = 123)
df <- data.frame(
  matrix(data = runif(n = 14000 * 7000), nrow = 14000)
)

# Build the 1000 x 1000 output `df2`: one value sampled from each
# 14-row by 7-column block of `df`.
# All positions are drawn in one call and looked up in a matrix, instead of
# subsetting and assigning into data frames once per cell (a million times).
# local() keeps the large temporaries out of the workspace.
df2 <- local({
  # Step 1: one matrix copy of the data for fast (row, column) lookups;
  # this needs enough memory for a second copy of `df`
  img <- as.matrix(df)

  # Step 2: for every output cell draw a position 1..98 inside its block.
  # Cells are taken row by row (all columns of output row 1, then row 2, ...),
  # the order a nested loop over rows and then columns would visit them.
  n_out <- 1000L
  pick <- sample.int(14L * 7L, n_out * n_out, replace = TRUE)
  cell <- seq_len(n_out * n_out) - 1L
  out_row <- cell %/% n_out
  out_col <- cell %% n_out

  # Positions inside a block count down its columns (14 values per column)
  rows <- out_row * 14L + (pick - 1L) %% 14L + 1L
  cols <- out_col * 7L + (pick - 1L) %/% 14L + 1L

  # Step 3: values arrive in row-by-row cell order, so fill the output by row
  as.data.frame(matrix(img[cbind(rows, cols)], nrow = n_out, byrow = TRUE))
})

check its dimensions

> dim(df2)
[1] 1000 1000

check its random element to see that loop worked

> df2[5,45]
[1] 0.1724635

Upvotes: 0

Carey Caginalp
Carey Caginalp

Reputation: 432

You can downsample by choosing which random observations you want and then subsetting out your dataframe. For example, the following code will generate a random position in each block, and then subset out the chosen points.

library(tidyverse)
# Example data: 14000 rows by 7000 columns of uniform random numbers
set.seed(seed = 42)
df <- data.frame(
  matrix(data = runif(n = 14000 * 7000), nrow = 14000, ncol = 7000)
)
# Grid layout: h bands of rows by w bands of columns, each band j wide
h <- 14
w <- 7
j <- 1000

# One random column inside each band of columns (drawn first) ...
x_places <- (seq_len(w) - 1) * j + sample.int(j, size = w, replace = TRUE)
# ... and one random row inside each band of rows
y_places <- (seq_len(h) - 1) * j + sample.int(j, size = h, replace = TRUE)

# Keep the values where the chosen rows and columns cross: an h x w data frame
new_df <- df[y_places, x_places]

You can change h, w, and j accordingly if you wanted a finer or coarser mesh. Also, if you named the rows and columns of your original dataframe, then you would have the positions by default.

Upvotes: 0

Related Questions