yakuelin
yakuelin

Reputation: 13

data.table fread() - first part of document skipped

Good day, I have several text files with the same layout that I would like to read in with the fread() function (see two sample files here: https://www.dropbox.com/sh/grpai6ppc6oq3ka/AADyECZHz5KW7wtv5xjF5-ena?dl=0 ). The documents are divided into two parts: the first part contains 16 columns, the second 7 columns. I only want the data from the first part, and only columns 1 and 2.

 # Keep only the first two columns and call them "Net" and "Nrp";
 # verbose = TRUE asks fread() to print details about how it reads the file.
 dat10 <- fread(
   "CalcV10.txt",
   select = c(1, 2),
   col.names = c("Net", "Nrp"),
   verbose = TRUE
 )

> head(dat10)
Net Nrp
1: 225   1
2: 247   1
3: 268   1
4: 287   1
5: 301  12
6: 302   4

This works perfectly for part of my data (e.g. CalcV10), where there is more than just one row of data.

For the other file, which contains only one row of data, however, the first part is skipped and the second part of the document is read instead:

> head(dat3)
Net  Nrp
1: 1000      9.9   
2: 1000     14.8   
3: 1000     12.7    
4: 1000     14.8    
5: 1000     11.7    
6: 1000     14.8  

I tried to set the column classes explicitly (colClasses=list(character=1:16)), but that did not help. I'm thankful for every small hint!

Best, yakuelin

I'm using version 1.10.4 of data.table, R version 3.3.2, and version 1.0.136 of RStudio (all of them were updated two weeks ago).

edit

I have 40 files with the same name and layout (Calc.txt). They are in 20 folders, named V1 - V20, each of which has two subfolders named after the two sim_types. To read in these text files I created the following function:

   # Read the first two columns ("Net", "Nrp") of Calc.txt from every
   # <dataDir>/V<i>/<sim_type> folder and stack them into one data.frame.
   #
   #   NrV       number of V folders to read (V1 ... V<NrV>)
   #   sim_type  name of the sub-folder inside each V folder
   #   FT        label stored in the `type` column of every row
   #
   # Relies on `dataDir` being defined in the calling environment and on
   # data.table being attached (fread, rbindlist).
   # Returns a data.frame with the columns Net, Nrp and type.
   read.res <- function(NrV, sim_type, FT) {
     # seq_len() gives an empty sequence for NrV = 0, whereas 1:NrV would
     # iterate over c(1, 0).
     parts <- lapply(seq_len(NrV), function(i) {
       # Build the full path instead of calling setwd(): the caller's working
       # directory stays untouched, and a relative dataDir keeps resolving
       # correctly on every iteration.
       path <- file.path(dataDir, paste0("V", i), sim_type, "Calc.txt")
       dat0 <- fread(path, select = c(1, 2), col.names = c("Net", "Nrp"))
       dat0$type <- FT
       dat0
     })
     # Bind once at the end rather than growing the result inside the loop.
     as.data.frame(rbindlist(parts))
   }

  # Read all 20 V folders once per sim_type (labelled "F" and "nF"),
  # then stack both results into a single table.
  Forest <- read.res(20, sim_F, "F")
  nonForest <- read.res(20, sim_nF, "nF")
  data <- rbind(Forest, nonForest)

@Sathish's answer works fine for reading one file at a time, but it would be great to automate this step for all files. I am struggling to incorporate Sathish's suggestion into my function. Any ideas?

Upvotes: 1

Views: 548

Answers (1)

Sathish
Sathish

Reputation: 12703

library(data.table)
fn1 <- "CalcV3.txt"
fn2 <- "CalcV10.txt"

# Locate the 'Sim_data' marker line; the number of rows to read is its line
# number minus 5. Fail early if the marker is missing, and use only the first
# hit so that nrows is always a single number.
marker1 <- grep('Sim_data', readLines(fn1))
stopifnot(length(marker1) > 0)
n1 <- marker1[1L] - 5
# Read n1 rows after skipping the first line. With sep = '\t' each row is
# presumably returned as one unsplit string in column V1 (the code below
# splits x1$V1 on spaces) -- confirm against the actual files.
x1 <- fread(fn1, nrows = n1, header = FALSE, skip = 1, sep = '\t', strip.white = FALSE, stringsAsFactors = FALSE)

# Same procedure for the second file.
marker2 <- grep('Sim_data', readLines(fn2))
stopifnot(length(marker2) > 0)
n2 <- marker2[1L] - 5
x2 <- fread(fn2, nrows = n2, header = FALSE, skip = 1, sep = '\t', strip.white = FALSE, stringsAsFactors = FALSE)

# split the file contents and convert it to data table
my_func <- function(x, from, to)
{
  # x        character vector, one element per line of the file
  # from, to positions of the first and last field to keep
  # Returns a numeric matrix with one row per element of x.

  # Break every line into tokens at each single space.
  tokens <- strsplit(x, '\ ')

  # Runs of spaces leave empty tokens behind: drop them, convert the rest to
  # numbers and keep only the requested positions.
  pick_fields <- function(parts) {
    filled <- parts[ parts != '' ]
    as.numeric(filled)[from:to]
  }
  fields <- lapply(tokens, pick_fields)

  # rbindlist() receives the per-line vectors as the columns of one table;
  # t() flips it so that every input line ends up as one row.
  t(rbindlist(l = list( fields )))
}

my_func(x = x1$V1, from = 1, to = 16)   # fields 1 to 16, i.e. every column
#    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1 1000 2100    7   10   11   12  0.9  1.9    2   2.2  12.3  14.8  17.1  42.1 -52.1 -40.1

my_func(x = x1$V1, from = 2, to = 4)  # only fields 2 through 4
#    [,1] [,2] [,3]
# V1 2100    7   10

my_func(x = x2$V1, from = 1, to = 16)  # every column of the second file
#     [,1] [,2]  [,3]  [,4]  [,5]  [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# V1   225    1 773.1 773.1 773.1 773.1  0.5  0.5  0.5   0.5  21.5  21.5  34.7  34.7 -42.5 -42.5
# V2   247    1 833.5 833.5 833.5 833.5  0.6  0.6  0.6   0.6  20.2  20.2  40.9  40.9 -15.4 -15.4
# V3   268    1 704.4 704.4 704.4 704.4  1.8  1.8  1.8   1.8  20.6  20.6  32.8  32.8 -42.9 -42.9
# V4   287    1 325.1 325.1 325.1 325.1  0.9  0.9  0.9   0.9  14.0  14.0  25.0  25.0 -42.1 -42.1
# V5   301   12 170.8 325.8 437.8 437.8  0.5  0.8  5.9   5.9   9.8  16.3  17.2  27.2 -32.2 -20.2
# V6   302    4  85.0 218.0 218.0 218.0  0.5  0.5  0.5   0.5   6.8  14.9   8.1  15.1 -38.4 -34.4
# V7   303    3  70.5  85.5  85.5  85.5  0.5  0.5  0.5   0.5   6.2   6.4  11.4  12.4 -26.9 -17.9
# V8   316   56 499.1 689.1 728.1 772.1  0.6  1.3  1.8   1.9  15.9  20.9  28.9  36.9 -38.6 -31.6
# V9   317  772 367.5 569.5 618.5 705.5  0.5  0.7  0.9   1.0  13.7  17.9  27.3  35.3 -26.6 -14.6
# V10  318   52 304.2 445.2 511.2 615.2  0.6  1.3  1.8   2.0  12.5  17.8  23.5  34.5 -21.6   0.4
# V11  319    4 412.3 527.3 527.3 527.3  0.6  0.7  0.7   0.7  15.1  20.9  21.9  33.9 -25.8  -4.8
# V12  330   14 107.7 264.7 421.7 421.7  0.5  0.8  1.3   1.3   8.2  14.4  14.7  27.7 -45.7 -27.7
# V13  331  872 229.3 406.3 468.3 531.3  0.5  1.0  1.5   2.3  11.7  17.1  19.2  28.2 -47.5 -37.5
# V14  332   35 428.1 690.1 728.1 774.1  1.1  3.2  4.1   4.8  17.0  22.6  22.6  35.6 -51.3 -35.3
# V15  333    4 452.0 523.0 523.0 523.0  0.7  1.0  1.0   1.0  15.8  17.1  28.5  29.5 -45.9 -38.9
# V16 1000 2100 143.6 200.6 215.6 232.6  1.2  2.1  2.3   2.4  12.4  14.8   8.1  17.1 -52.1 -41.1

EDIT:

# split the file contents and convert it to data table
my_func <- function(x, from, to)
{
  # x        character vector, one element per line of the file
  # from, to positions of the first and last field to keep
  # Returns a numeric matrix with one row per element of x.

  # Break every line into tokens at each single space.
  tokens <- strsplit(x, '\ ')

  # Runs of spaces leave empty tokens behind: drop them, convert the rest to
  # numbers and keep only the requested positions.
  pick_fields <- function(parts) {
    filled <- parts[ parts != '' ]
    as.numeric(filled)[from:to]
  }
  fields <- lapply(tokens, pick_fields)

  # rbindlist() receives the per-line vectors as the columns of one table;
  # t() flips it so that every input line ends up as one row.
  t(rbindlist(l = list( fields )))
}

# Attach data.table once, up front. Unlike require(), library() stops with an
# error when the package is missing instead of returning FALSE.
library(data.table)

root_path <- "temp"   # Set `root_path` variable to a desired location
# Every <root_path>/V<i>/sim_types<j> folder, for i = 1..20 and j = 1..2
fdirs <- unlist(lapply(file.path(root_path, paste0('V', 1:20)),
                       function(x) file.path(x, paste0('sim_types', 1:2))))

all_dfs <- list()  # one two-column matrix (Net, Nrp) per file, keyed by file path
for (i in fdirs)
{
  fn <- file.path(i, 'Calc.txt')

  if (!file.exists(fn)) {
    warning(paste0('The file ', fn, ' does not exist!'))
    next
  }

  # Line number(s) of the 'Sim_data' marker; the first part of the file has
  # (marker line - 5) data rows.
  marker <- grep('Sim_data', readLines(fn))
  if (length(marker) == 0L || marker[1L] <= 5L) {
    # An empty file or one without the marker would otherwise hand fread()
    # an empty or non-positive row count; skip it and carry on.
    warning(paste0('The file ', fn, ' contains no data rows before the Sim_data marker!'))
    next
  }
  n1 <- marker[1L] - 5

  # Read the n1 rows after skipping the first line
  x1 <- fread(fn, nrows = n1, header = FALSE, skip = 1, sep = '\t', strip.white = FALSE, stringsAsFactors = FALSE)
  df <- my_func(x1$V1, 1, 2)   # keep only the first two fields
  colnames(df) <- c('Net', 'Nrp')
  all_dfs[[fn]] <- df
}

warnings()
# 38: The file temp/V20/sim_types2/Calc.txt does not exist!

all_dfs
# $`temp/V1/sim_types1/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V1/sim_types2/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V2/sim_types1/Calc.txt`
# Net  Nrp
# V1 1000 2100
# 
# $`temp/V2/sim_types2/Calc.txt`
# Net  Nrp
# V1 1000 2100

If you want to play around with files and directories, try this reproducible example, which will create directories and files. Set root_path variable to a desired location.

# reproducible example
root_path <- "temp"

# Top-level folders V1 ... V20 below root_path
dirs <- file.path(root_path, paste0('V', 1:20))

for (fpath in dirs) {
  dir.create(path = fpath, recursive = TRUE)

  # Each V folder gets the sub-folders sim_types1 and sim_types2,
  # and an empty Calc.txt is created inside each of them.
  sub_dirs <- file.path(fpath, paste0('sim_types', 1:2))
  for (sfpath in sub_dirs) {
    dir.create(path = sfpath, recursive = TRUE)
    file.create(file.path(sfpath, 'Calc.txt'))
  }
}

Upvotes: 1

Related Questions