Reputation: 1004
I'm trying to write a program that takes a command-line argument, scans the directory tree given by that argument, creates a list of every file in it, and then sorts the list by file size.
I'm not much of a script-guy - but this is what I've got and it's not working:
import sys
import os
from os.path import getsize
def list_files_by_size(dirpath):
    """Return the paths of all files below *dirpath*, smallest file first.

    The whole directory tree is walked. Every entry is a full path
    (``root`` joined with the file name), because ``os.walk`` only yields
    bare file names and ``getsize`` needs a path it can actually open.
    A directory that does not exist yields an empty list.
    """
    file_list = []
    # Get all entries in the directory tree
    for root, dirs, files in os.walk(dirpath):
        for name in files:
            file_list.append(os.path.join(root, name))
    return sorted(file_list, key=getsize)


def main():
    """Print every file under the directory named on the command line."""
    # sys.argv[0] is the script itself; the directory is the first argument.
    if len(sys.argv) < 2:
        sys.exit("usage: %s DIRECTORY" % sys.argv[0])
    # Get dirpath
    dirpath = os.path.abspath(sys.argv[1])
    if os.path.isdir(dirpath):
        for item in list_files_by_size(dirpath):
            sys.stdout.write(str(item) + '\n')
    else:
        print("not found")


if __name__ == "__main__":
    main()
Can anyone point me in the right direction?
Upvotes: 22
Views: 33902
Reputation: 851
Plain old os.listdir but with extra params
import os
import operator
def oslistdir(directory='.', sort=True, size='smallest_first'):
    """List the regular files directly inside *directory*.

    Sub-directories are skipped and the tree is not walked recursively.

    directory -- folder to list (defaults to the current directory).
    sort      -- when true, order the names by file size; when false,
                 keep the order in which ``os.listdir`` reported them.
    size      -- 'largest_first' sorts descending; any other value
                 (including the default 'smallest_first') sorts ascending.

    Returns a list of bare file names (no directory component).
    """
    names = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    if not sort:
        return names

    def size_of(name):
        # sorted() calls the key once per item, so each file is stat'ed once.
        return os.path.getsize(os.path.join(directory, name))

    return sorted(names, key=size_of, reverse=(size == 'largest_first'))
Smallest first:
# oslistdir already returns a list, so no comprehension is needed.
oslistdir(my_folder_path, sort=True, size='smallest_first')
Largest first:
# oslistdir already returns a list, so no comprehension is needed.
oslistdir(my_folder_path, sort=True, size='largest_first')
Upvotes: 0
Reputation: 31
I think this is what you are looking for:
import numpy as np
import os,glob
# Replace the placeholder with a real glob pattern such as "*.txt".
pattern = "*.file extension"
# Stat every matching file once and remember its size in bytes.
sizes = {path: os.stat(path).st_size for path in glob.glob(pattern)}
# Largest file first; sizes are reported in KB with one decimal place.
for path in sorted(sizes, key=sizes.get, reverse=True):
    print(f'{path} is {sizes[path] / 1024:.1f} KB')
Upvotes: 2
Reputation: 1894
How about using pandas?
import pandas as pd
import os
# Full path of every entry that os.listdir reports for files_dir.
file_paths = [os.path.join(files_dir, entry) for entry in os.listdir(files_dir)]
# Size in bytes of each of those paths, in the same order.
file_sizes = list(map(os.path.getsize, file_paths))
# One row per path, ordered from the largest size to the smallest.
df = pd.DataFrame(
    {'file_path': file_paths, 'file_size': file_sizes}
).sort_values(by='file_size', ascending=False)
You can then easily retrieve the list of values from the DataFrame, for example with df['file_path'].tolist().
Upvotes: 2
Reputation: 1882
This is an approach using generators. It should be faster for a large number of files…
This is the beginning of both examples:
import os, operator, sys
# sys.argv[0] is the script itself; the directory to scan is the first real
# argument. Fall back to the current directory when none is given.
dirpath = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else os.curdir)
# make a generator for all file paths within dirpath
all_files = (
    os.path.join(basedir, filename)
    for basedir, dirs, files in os.walk(dirpath)
    for filename in files
)
If you just want a list of the files without the size, you can use this:
# Materialise the generator, then order the paths in place by size on disk.
sorted_files = list(all_files)
sorted_files.sort(key=os.path.getsize)
But if you want the file paths together with their sizes in a list, you can use this:
# Lazily pair each path with its size in bytes, e.g. ('/Path/to/the.file', 1024)
files_and_sizes = ((file_path, os.path.getsize(file_path)) for file_path in all_files)
# Order the pairs by their second element, the size, smallest first.
sorted_files_with_size = sorted(files_and_sizes, key=lambda pair: pair[1])
Upvotes: 23
Reputation: 3374
With argv[0] you are extracting the command itself, not the first argument; use argv[1] for that:
# Directory to scan: the first command-line argument after the script name.
dirpath = sys.argv[1]  # argv[0] contains the command itself.
For performance reasons I suggest you prefetch the file sizes instead of asking the OS about the size of the same file multiple times during the sorting (as suggested by Koffein, os.walk is the way to go):
# Collect a (path, size) pair for every file below dirpath. Each size is
# read from the OS exactly once, up front, and each path is joined once.
files_list = []
for path, dirs, files in os.walk(dirpath):
    for name in files:
        full_path = os.path.join(path, name)
        files_list.append((full_path, getsize(full_path)))
Assuming you don't need the unsorted list, we will use the in-place sort() method:
# Sort the (path, size) pairs in place by their size component.
files_list.sort(key=lambda entry: entry[1])
Upvotes: 2
Reputation: 852
Hopefully this function will help you out (I'm using Python 2.7):
import os
def get_files_by_file_size(dirname, reverse=False):
    """Return list of file paths in directory sorted by file size.

    Only regular files directly inside *dirname* are considered;
    sub-directories are skipped and not descended into.

    dirname -- directory to list.
    reverse -- if True sort from largest to smallest,
               if False (the default) sort from smallest to largest.

    Each returned path is *dirname* joined with the file's base name.
    Works unchanged on Python 2.7 and Python 3 (no ``xrange``).
    """
    # Get list of files
    candidates = (os.path.join(dirname, basename)
                  for basename in os.listdir(dirname))
    filepaths = [path for path in candidates if os.path.isfile(path)]

    # The key function runs once per path, so every file is stat'ed once and
    # no temporary (path, size) tuples need to be packed and unpacked.
    return sorted(filepaths, key=os.path.getsize, reverse=reverse)
Upvotes: 17