Reputation: 121
I have a very large data frame (7000 columns and 14000 observations). They are in fact greyscale observations of each pixel of an image: 7000px on the x-axis and 14000px on the y-axis. I'm looking for a way to do the following:
Any ideas on how I might do this would be greatly appreciated
Upvotes: 2
Views: 224
Reputation: 121
I'm sure someone better than me has a more elegant solution, but this is what I ended up doing to randomly sample each 1000 by 1000 grid in a matrix that was 7000 by 14000. It's long-winded but it works.
Setting a min and max value for the random number (1 and 1000) and working row by row to randomly select a row value and a column value and plucking that number out of the matrix.
It does this seven times for the first row (7000/1000), then adds 1000 to the min and max values for the second row. Each time the values are plucked they are stored to "g" and attached to df_new using rbind.
This repeats until I get to row 14, then there is some tidying up of the data and renaming.
The end result is a data frame of 98 values along with the grid number they came from and its x and y coordinates in the matrix.
Not very flexible and fixed for matrix 7000 by 14000 and grids that are 1000 by 1000. But I don't need to change this..... yet !!!
My wish list down the track would be a function that lets me set..
Thanks all for your input. Much appreciated.
# MAKE A DUMMY MATRIX
# 14000 rows (y) by 7000 columns (x) of uniform random "pixel" values,
# stored as a data frame with default column names X1..X7000
set.seed(123)
df <- data.frame(matrix(runif(14000 * 7000), nrow = 14000, ncol = 7000))
# PICK 1 RANDOM VALUE FROM EACH 1000 BY 1000 GRID
# GRIDS ARE NUMBERED ACROSS THE TOP ROW FIRST (GRIDS 1 TO 7),
# THEN THE NEXT ROW (GRIDS 8 TO 14) ETC ETC
# UNTIL GRID 98
library(dplyr)

# Sample one cell uniformly at random from every block of `data`.
#
# Arguments:
#   data        data frame or matrix of numeric values
#               (rows are y positions, columns are x positions).
#   block_rows  block height, in rows.
#   block_cols  block width, in columns (defaults to square blocks).
#
# Returns a data frame with one row per block and the columns
#   x      column index of the sampled cell in `data`
#   y      row index of the sampled cell in `data`
#   value  the sampled cell itself
# Blocks are ordered left to right, then top to bottom. When the dimensions
# of `data` are not exact multiples of the block size, the trailing blocks
# are smaller and are sampled over the cells they actually contain.
sample_grid_cells <- function(data, block_rows = 1000L,
                              block_cols = block_rows) {
  stopifnot(
    length(dim(data)) == 2L,
    nrow(data) > 0L,
    ncol(data) > 0L,
    block_rows >= 1L,
    block_cols >= 1L
  )

  # First row / column of every block, and how many rows / columns it spans
  row_starts <- seq(1L, nrow(data), by = block_rows)
  col_starts <- seq(1L, ncol(data), by = block_cols)
  row_sizes <- pmin(block_rows, nrow(data) - row_starts + 1L)
  col_sizes <- pmin(block_cols, ncol(data) - col_starts + 1L)

  # One entry per block; the column-block index varies fastest so that
  # blocks are numbered across a row of blocks before moving down
  row_block <- rep(seq_along(row_starts), each = length(col_starts))
  col_block <- rep(seq_along(col_starts), times = length(row_starts))

  # sample.int() gives every index the same probability.
  # round(runif(1, a, b)) does not: a and b are each picked only half as
  # often as the values in between.
  draw_offset <- function(sizes) {
    vapply(sizes, function(s) sample.int(s, 1L), integer(1))
  }
  y <- row_starts[row_block] - 1L + draw_offset(row_sizes[row_block])
  x <- col_starts[col_block] - 1L + draw_offset(col_sizes[col_block])

  # Pull out only the sampled cells; never copies or converts all of `data`
  value <- vapply(
    seq_along(x),
    function(k) data[y[k], x[k]],
    numeric(1)
  )

  data.frame(x = x, y = y, value = value)
}

set.seed(42)
df_new <- sample_grid_cells(df, block_rows = 1000L, block_cols = 1000L) %>%
  mutate(grid = seq_len(n())) %>% # add a sequential column named grid
  relocate(grid) # final column order: grid, x, y, value
Upvotes: 0
Reputation: 102700
If you want to have block matrix, you can try the code below
# Cut `df` into bands of 1000 consecutive rows, cut every band into groups
# of 1000 consecutive columns, and turn each piece into a plain matrix.
# sapply() stacks the per-band lists as columns, so t() is needed to get
# one row of `blks` per row band and one column per column group.
row_bands <- split(df, ceiling(seq(nrow(df)) / 1000))
blks <- t(
  sapply(row_bands, function(band) {
    col_groups <- split.default(band, ceiling(seq_along(band) / 1000))
    lapply(col_groups, as.matrix)
  })
)
and you will see
> blks
1 2 3 4
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 6 7 8
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
9 10 11 12
1 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000 Numeric,1000000 Numeric,1000000
13 14
1 Numeric,1000000 Numeric,1000000
2 Numeric,1000000 Numeric,1000000
3 Numeric,1000000 Numeric,1000000
4 Numeric,1000000 Numeric,1000000
5 Numeric,1000000 Numeric,1000000
6 Numeric,1000000 Numeric,1000000
7 Numeric,1000000 Numeric,1000000
> dim(blks[[2,3]])
[1] 1000 1000
> str(blks[[2,3]])
num [1:1000, 1:1000] 0.909 0.833 0.347 0.837 0.58 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:1000] "1001" "1002" "1003" "1004" ...
..$ : chr [1:1000] "X2001" "X2002" "X2003" "X2004" ...
Data
# Example data: 7000 rows by 14000 columns of uniform random values
set.seed(123)
df <- data.frame(matrix(runif(7000 * 14000), nrow = 7000, ncol = 14000))
Upvotes: 0
Reputation: 26238
Follow this strategy
#create a sample dataframe: 14000 rows by 7000 columns of uniform random values
set.seed(123)
df <- data.frame(matrix(runif(14000 * 7000), nrow = 14000, ncol = 7000))
#Step-1: work out how many source pixels feed each cell of the output `df2`
#Step-2: for every output cell, sample one pixel from its source block
# The output is 1000 x 1000; with a 14000 x 7000 `df` each output cell
# covers a block of 14 source rows by 7 source columns.
# Sampling is done one strip of source columns at a time rather than one
# cell at a time: a million single-cell data-frame subsets and assignments
# are extremely slow. The random draws are therefore consumed in a different
# order, so individual sampled values differ from a cell-by-cell loop run
# with the same seed.
df2 <- local({
  n_out <- 1000L
  block_h <- nrow(df) %/% n_out # source rows per output cell
  block_w <- ncol(df) %/% n_out # source columns per output cell
  stopifnot(block_h >= 1L, block_w >= 1L)

  # Source row just above the block feeding each output row
  row_base <- (seq_len(n_out) - 1L) * block_h

  sampled <- vapply(
    seq_len(n_out),
    function(j) {
      # The block_w source columns that feed output column j
      strip_cols <- (j - 1L) * block_w + seq_len(block_w)
      strip <- as.matrix(df[, strip_cols, drop = FALSE])
      # Independent uniform row and column offsets give a uniform draw
      # over all block_h * block_w pixels of each block
      pick_row <- row_base + sample.int(block_h, n_out, replace = TRUE)
      pick_col <- sample.int(block_w, n_out, replace = TRUE)
      strip[cbind(pick_row, pick_col)]
    },
    numeric(n_out)
  )

  # Column j of `sampled` is output column j; columns are named V1..V1000
  as.data.frame(sampled)
})
check its dimensions
> dim(df2)
[1] 1000 1000
check its random element to see that loop worked
> df2[5,45]
[1] 0.1724635
Upvotes: 0
Reputation: 432
You can downsample by choosing which random observations you want and then subsetting out your dataframe. For example, the following code will generate a random position in each block, and then subset out the chosen points.
library(tidyverse)
# Example data: 14000 rows by 7000 columns of uniform random values
set.seed(42)
df <- data.frame(matrix(runif(14000 * 7000), nrow = 14000, ncol = 7000))
# h: number of blocks down, w: number of blocks across, j: block side length
h <- 14
w <- 7
j <- 1000
# One random column inside each of the w column bands
# (band start offset plus a random position 1..j within the band)
x_places <- (seq_len(w) - 1) * j + sample.int(j, w, replace = TRUE)
# One random row inside each of the h row bands
y_places <- (seq_len(h) - 1) * j + sample.int(j, h, replace = TRUE)
# Keep only the chosen rows and columns: an h x w data frame
new_df <- df[y_places, x_places]
You can change h, w, and j accordingly if you want a finer or coarser mesh. Also, if you named the rows and columns of your original dataframe, then you would have the positions by default.
Upvotes: 0