Reputation: 5088
I have created a simple data.tree
through importing a folder structure with files inside of it.
# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it; every later pacman call is namespaced.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load_gh("trinker/pathr")
library(pathr)
library(data.tree)

# Import the folder (files included) as a data.tree object.
folder_structure <- pathr::tree(
  path = "/Users/username/Downloads/top_level/",
  use.data.tree = TRUE,
  include.files = TRUE
)
Now, I would like to convert the object folder_structure
into a data.frame
with one row per folder and a column that specifies how many files each folder contains. How can I accomplish this?
For example, I have this very simple folder structure:
top_level_folder
sub_folder_1
file1.txt
sub_folder_2
file2.txt
Answering the question would involve creating an output that looks like this:
Folders Files
top_level_folder 0
sub_folder_1 1
sub_folder_2 1
The first column can simply be generated through calling list.dirs("/Users/username/Downloads/top_level/")
, but I don't know how to generate the second column. Note that the second column is non-recursive, meaning that files within subfolders are not counted (i.e. top_level_folder
contains 0
files, even though the subfolders of top_level_folder
contain 2 files).
If you want to see whether your solution scales or not, download the Rails codebase: https://github.com/rails/rails/archive/master.zip and try it on Rails' more complex file structure.
Upvotes: 4
Views: 1020
Reputation: 1244
Here's a very compact solution:
# Print only the non-leaf nodes, adding a `files` column that holds the
# number of leaf children of each printed node.
print(
  folder_structure,
  files = function(node) {
    leaf_flags <- vapply(
      node$children,
      function(child) child$isLeaf,
      logical(1)
    )
    sum(leaf_flags)
  },
  filterFun = isNotLeaf,
  pruneMethod = NULL
)
This produces something like this:
levelName files
1 data.tree 16
2 ¦--data 2
3 ¦--data_gen 2
4 ¦--.git 8
5 ¦ ¦--hooks 9
6 ¦ ¦--info 1
7 ¦ ¦--logs 1
8 ¦ ¦ °--refs 1
9 ¦ ¦ ¦--heads 4
10 ¦ ¦ ¦--remotes 0
11 ¦ ¦ ¦ °--origin 5
12 ¦ ¦--objects 0
13 ¦ ¦ ¦--01 4
14 ¦ ¦ ¦--02 5
...
Note, however, that this counts empty folders as files too.
Upvotes: 1
Reputation: 43354
All you really need to do is make a list of directories with list.dirs
(which defaults to recursive = TRUE
) and then iterate over it, finding the length of list.files
(which defaults to recursive = FALSE
) for that directory. Neatening to a nice data.frame,
library(purrr)
# Count the direct entries of every directory below the first library path
# (omit the path for the current directory, or supply an alternate path).
files <- list.dirs(.libPaths()[1]) %>%
  map_df(function(dir_path) {
    list(
      path = dir_path,
      files = length(list.files(dir_path))
    )
  })
files
#> # A tibble: 4,457 x 2
#> path files
#> <chr> <int>
#> 1 /Library/Frameworks/R.framework/Versions/3.4/Resources/library 314
#> 2 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind 9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help 5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html 2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta 6
#> 6 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R 3
#> 7 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack 14
#> 8 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/help 5
#> 9 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/html 2
#> 10 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/acepack/libs 2
#> # ... with 4,447 more rows
or all in base if you prefer,
# Base-R version. The directories are listed once and the per-directory entry
# counts are computed with a type-stable vapply(), so a single data frame is
# built instead of rbind()-ing one one-row data frame per directory. An empty
# directory list yields a zero-row data frame rather than NULL.
lib_dirs <- list.dirs(.libPaths()[1])
files <- data.frame(
  path = lib_dirs,
  files = vapply(
    lib_dirs,
    function(path) length(list.files(path)),
    integer(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
head(files)
#> path files
#> 1 /Library/Frameworks/R.framework/Versions/3.4/Resources/library 314
#> 2 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind 9
#> 3 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/help 5
#> 4 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/html 2
#> 5 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/Meta 6
#> 6 /Library/Frameworks/R.framework/Versions/3.4/Resources/library/abind/R 3
Upvotes: 1
Reputation: 5273
list.files
returns all file and directory paths. There is no is.file
function, but there is dir.exists
. Since we know all the paths are actual nodes, those that aren't directories will be counted as files.
# List every file and directory below top_level. The returned paths are
# relative to top_level; the working directory is left untouched.
top_level <- "~/rails-master"
subitems <- data.frame(
  path = list.files(
    top_level,
    include.dirs = TRUE,
    recursive = TRUE
  ),
  stringsAsFactors = FALSE
)
# Every listed path exists, so whatever is not a directory is a file.
# The paths are relative, so resolve them against top_level before testing.
subitems$is_file <- !dir.exists(file.path(top_level, subitems$path))
For each row, if the path is to a directory, then it's its own directory path. If the path's for a file, then its parent is the directory path. Then it's simply a matter of counting how often is_file
is true by directory path.
# A file is attributed to its parent directory; a directory to itself.
subitems$dir_path <- with(subitems, ifelse(is_file, dirname(path), path))

# Summing the logical is_file flag per directory gives the file count.
file_counts <- with(subitems, tapply(is_file, dir_path, sum))

result <- data.frame(Folders = names(file_counts), Files = file_counts)
Upvotes: 0
Reputation: 3964
list.dirs()
provides a vector of every subdirectory reachable from a starting folder, so that handles the first column of your data-frame. Very convenient.
# Collect the starting folder and, recursively, every subdirectory below it
dir <- "."
xs <- list.dirs(path = dir, full.names = TRUE, recursive = TRUE)
list.files()
can tell us the contents of each of those folders, but it includes files and folders. We just want the files. To get the count of files, we need to filter the output of list.files()
with a predicate. file.info()
can tell us whether a given file is a directory or not, so we build our predicate from that.
# Predicates telling directories and files apart, based on file.info().
# is_dir() yields TRUE for directories, FALSE for other existing paths and
# NA for paths that do not exist; is_file() is its logical negation.
is_dir <- function(x) {
  info <- file.info(x)
  info[["isdir"]]
}
is_file <- function(...) {
  !is_dir(...)
}
Now, we solve how to get the number of files in a single folder. Summing boolean values returns the number of TRUE
cases.
# Number of non-directory entries directly inside one folder.
# Entries are listed with full paths so that is_file() can inspect them
# regardless of the working directory.
count_files_in_one_dir <- function(dir) {
  sum(is_file(list.files(dir, full.names = TRUE)))
}
For convenience, we wrap that function to make it work on many folders.
# Apply count_files_in_one_dir() to each folder in `dir`, returning an
# unnamed numeric vector with one count per folder.
count_files_in_dir <- function(dir) {
  vapply(
    X = dir,
    FUN = count_files_in_one_dir,
    FUN.VALUE = numeric(1),
    USE.NAMES = FALSE
  )
}
Now we can count the files.
# Assemble the result: one row per directory with its direct file count.
# tibble::tibble() is used because tibble::data_frame() is deprecated.
df <- tibble::tibble(
  dir = xs,
  nfiles = count_files_in_dir(xs)
)
df
#> # A tibble: 688 x 2
#> dir nfiles
#> <chr> <dbl>
#> 1 . 11
#> 2 ./.github 3
#> 3 ./actioncable 7
#> 4 ./actioncable/app 0
#> 5 ./actioncable/app/assets 0
#> 6 ./actioncable/app/assets/javascripts 1
#> 7 ./actioncable/app/assets/javascripts/action_cable 5
#> 8 ./actioncable/bin 1
#> 9 ./actioncable/lib 1
#> 10 ./actioncable/lib/action_cable 8
#> # ... with 678 more rows
Upvotes: 3
Reputation: 47330
# Build the example tree from the question.
dir.create("top_level_folder")
dir.create("top_level_folder/sub_folder_1")
dir.create("top_level_folder/sub_folder_2")
a <- "hello"
save(a, file = "top_level_folder/sub_folder_1/file1.txt")
save(a, file = "top_level_folder/sub_folder_2/file2.txt")

path <- "top_level_folder"

# Relative path of every file below `path` (directories are not listed).
files <- list.files(path, recursive = TRUE)

# Parent directory of each file, relative to `path`. Using the full relative
# parent (not just its last component) keeps nested and same-named folders
# apart. dirname() reports "." for files directly inside `path`, whereas
# list.dirs(full.names = FALSE) reports the root as "", so align the two.
folders <- dirname(files)
folders[folders == "."] <- ""

all_folders <- data.frame(
  Folders = list.dirs(path, full.names = FALSE, recursive = TRUE),
  stringsAsFactors = FALSE
)

# Tabulating against the complete set of directories as factor levels gives
# 0 for directories without files and keeps the list.dirs() order.
output <- data.frame(
  Folders = all_folders$Folders,
  Files = as.integer(table(factor(folders, levels = all_folders$Folders))),
  stringsAsFactors = FALSE
)

# Show the root directory under its own name instead of "".
is_root <- all_folders$Folders == ""
all_folders$Folders[is_root] <- basename(path)
output$Folders[is_root] <- basename(path)
#            Folders Files
# 1 top_level_folder     0
# 2     sub_folder_1     1
# 3     sub_folder_2     1
Upvotes: 1
Reputation: 1641
You can use a dplyr
chain with the parse_path()
function from the pathr
package. The tree
function is basically just a wrapper around parse_path
so it's easier to use parse_path
directly. E.g. like this:
library(pathr)
library(dplyr)
# List every file below the starting folder, split each path into its
# components, keep the component at the chosen depth and count how often
# each one occurs.
fls <- dir("C:/RBuildTools/3.3", recursive = TRUE, full.names = TRUE) %>%
  parse_path() %>%
  index(4) %>% # this is where you indicate the level or "depth"
               # of the folder for which you want subfolder file counts
  data.frame(folders = .) %>%
  group_by(folders) %>%
  tally() %>%
  arrange(n)
# Files that sit directly in the starting folder appear with a count of 1.
# If you want to get rid of them, just add a
#   filter(n > 1)
# at the end of the dplyr chain (note that this also drops any subfolder
# that contains exactly one file).
For me the above code produces the following result:
> fls
# A tibble: 12 × 2
folders n
<fctr> <int>
1 COPYING 1
2 README.txt 1
3 Rtools.txt 1
4 unins000.dat 1
5 unins000.exe 1
6 VERSION.txt 1
7 bin 56
8 mingw_libs 200
9 texinfo5 356
10 gcc-4.6.3 3787
11 mingw_32 13707
12 mingw_64 14619
Upvotes: 1