Paul
Paul

Reputation: 4450

Search specific folder and sort directories by size, modification date, or path

This Bash script searches for directories named node_modules (or a specified folder) within the current working directory and categorizes them based on their size, last modification date, and path.

The problem is that sorting is not working, especially by the size of the files. Sorting by size must be decreasing from the largest to the smallest.

Bash version:

#!/bin/bash

start_time=$(date +%s.%N)

find_dir="node_modules"
sort_by="path"

while [[ "$1" =~ ^- ]]; do
  case $1 in
    -t|--target)
      find_dir="$2"
      shift 2
      ;;
    -s|--sort)
      sort_by="$2"
      shift 2
      ;;
    *)
      echo "Invalid option: $1"
      exit 1
      ;;
  esac
done

dirs=$(find $(pwd) -type d -name "$find_dir" 2>/dev/null)
json="\"paths\": ["

total_size_kb=0

declare -a results

for dir in $dirs; do
  parent_dir=$(dirname "$dir")
  
  if [[ ! "$parent_dir" =~ /$find_dir/ ]]; then
    last_mod=$(stat -f "%Sm" -t "%d/%m/%Y %H:%M:%S" "$dir")
    size_kb=$(du -sk "$dir" | awk '{print $1}')
    
    total_size_kb=$((total_size_kb + size_kb))

    size_mb=$(echo "scale=2; $size_kb/1024" | bc)
    
    if (( $(echo "$size_mb < 1" | bc -l) )); then
      size=$(echo "scale=2; $size_kb" | bc)
      size="${size} KB"
    elif (( $(echo "$size_mb >= 1024" | bc -l) )); then
      size=$(echo "scale=2; $size_mb/1024" | bc)
      size="${size} GB"
    else
      size="${size_mb} MB"
    fi

    results+=("{\"path\": \"$dir\", \"last_mod\": \"$(date -r "$dir" -u +%dd)\", \"size\": \"$size\"}")
  fi
done

if [[ "$sort_by" == "size" ]]; then
  results=$(for r in "${results[@]}"; do echo "$r"; done | sort -t '"' -k 10 -n -r)
elif [[ "$sort_by" == "path" ]]; then
  results=$(for r in "${results[@]}"; do echo "$r"; done | sort -t '"' -k 4)
elif [[ "$sort_by" == "last-mod" ]]; then
  results=$(for r in "${results[@]}"; do echo "$r"; done | sort -t '"' -k 8)
fi

json="${json}$(echo "$results" | tr '\n' ',' | sed 's/,$//')"

json="${json}]"

end_time=$(date +%s.%N)
elapsed_time=$(echo "$end_time - $start_time" | bc)

total_size_mb=$(echo "scale=2; $total_size_kb/1024" | bc)

json="{
  \"releasable_space\": \"${total_size_mb} MB\", 
  \"search_completed\": \"$(echo $elapsed_time | cut -d'.' -f1)s\",
  ${json}
}"

echo "$json"

Js version:

const fs = require('fs').promises;
const path = require('path');
const { execSync } = require('child_process');

const startTime = process.hrtime.bigint();
let targetDir = 'node_modules';
let sortBy = 'path';
let searchPath = '.';
let maxDepth = 5;
let saveFile = null;
let noMaxDepth = false;

const args = process.argv.slice(2);
args.forEach((arg, i) => {
  if (arg === '-t' || arg === '--target') targetDir = args[i + 1];
  if (arg === '-s' || arg === '--sort') sortBy = args[i + 1];
  if (arg === '-p' || arg === '--path') searchPath = args[i + 1];
  if (arg === '--save') saveFile = args[i + 1];
  if (arg === '--maxDepth') maxDepth = parseInt(args[i + 1], 10);
  if (arg === '--noMaxDepth') noMaxDepth = true;
});

(async () => {
  if (isNaN(maxDepth) || maxDepth < 0) {
    console.error("Error: Invalid value for --maxDepth. It should be a positive integer.");
    process.exit(1);
  }

  const validSortOptions = ['path', 'size', 'last-mod'];
  if (!validSortOptions.includes(sortBy)) {
    console.error(`Error: Invalid sort option. Valid options are: ${validSortOptions.join(", ")}`);
    process.exit(1);
  }

  try {
    await fs.access(searchPath);
  } catch (error) {
    console.error(`Error: The search path '${searchPath}' does not exist or is not accessible.`);
    process.exit(1);
  }

  if (noMaxDepth) maxDepth = Infinity;

  const formatDate = (date) => {
    const day = String(date.getDate()).padStart(2, '0');
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const year = date.getFullYear();
    const hours = String(date.getHours()).padStart(2, '0');
    const minutes = String(date.getMinutes()).padStart(2, '0');
    const seconds = String(date.getSeconds()).padStart(2, '0');
    return `${day}/${month}/${year} ${hours}:${minutes}:${seconds}`;
  };

  const formatSize = (sizeInKB) => {
    if (sizeInKB < 1024) {
      return `${sizeInKB} KB`;
    }

    const sizeInMB = sizeInKB / 1024;
    if (sizeInMB < 1024) {
      return `${sizeInMB.toFixed(2)} MB`;
    }

    const sizeInGB = sizeInMB / 1024;
    return `${sizeInGB.toFixed(2)} GB`;
  };

  const formatTime = (elapsedTime) => {
    const seconds = Number(elapsedTime) / 1e9;
    if (seconds < 60) {
      return `${seconds.toFixed(2)}s`;
    }

    const minutes = seconds / 60;
    if (minutes < 60) {
      return `${minutes.toFixed(2)}m`;
    }

    const hours = minutes / 60;
    return `${hours.toFixed(2)}h`;
  };

  const getDaysDifference = (lastModDate) => {
    const today = new Date();
    const diffTime = today - new Date(lastModDate);
    const diffDays = Math.floor(diffTime / (1000 * 3600 * 24));
    return diffDays;
  };

  const getDirectories = async (dirPath, depth = 0) => {
    if (maxDepth !== Infinity && depth > maxDepth) return [];

    const dirs = [];
    try {
      const items = await fs.readdir(dirPath);
      for (const item of items) {
        const fullPath = path.resolve(dirPath, item);
        try {
          const stat = await fs.stat(fullPath);
          if (stat.isDirectory()) {
            dirs.push(fullPath);
            if (!fullPath.includes(targetDir)) {
              dirs.push(...await getDirectories(fullPath, depth + 1));
            }
          }
        } catch (e) {
          //console.error(`Error reading directory ${fullPath}:`, e);
        }
      }
    } catch (err) {
      //console.error(`Error reading path ${dirPath}:`, err);
    }
    return dirs;
  };

  const getDirInfo = async (dir) => {
    let size = 0;
    const isWindows = process.platform === 'win32';

    if (isWindows) {
      try {
        const result = execSync(`dir /s /a "${dir}"`).toString();
        const match = result.match(/bytes free/g);
        size = match ? parseInt(match[0].split(" ")[0]) : 0;
      } catch (e) {
        console.error(`Error calculating size for ${dir}:`, e);
      }
    } else {
      try {
        const result = execSync(`du -sk "${dir}"`).toString();
        size = parseInt(result.split("\t")[0]);
      } catch (e) {
        console.error(`Error calculating size for ${dir}:`, e);
      }
    }

    const lastMod = await fs.stat(dir);
    const lastModDate = lastMod.mtime;
    const lastModDay = getDaysDifference(lastModDate);

    return { path: dir, size, lastMod: formatDate(lastModDate), lastModDay };
  };

  const spinner = ["⠙", "⠘", "⠰", "⠴", "⠤", "⠦", "⠆", "⠃", "⠋", "⠉"];
  let spinIndex = 0;

  const displaySpinner = () => {
    process.stdout.write(`\rLoading ${spinner[spinIndex]} `);
    spinIndex = (spinIndex + 1) % spinner.length;
  };

  const spinnerInterval = setInterval(displaySpinner, 100);

  const dirs = (await getDirectories(searchPath))
    .filter((dir) => dir.includes(targetDir))
    .map(async (dir) => await getDirInfo(dir));

  const resolvedDirs = await Promise.all(dirs);

  const totalSizeKB = resolvedDirs.reduce((acc, dir) => acc + dir.size, 0);

  const sortedDirs = resolvedDirs.sort((a, b) => {
    if (sortBy === 'size') return b.size - a.size;
    if (sortBy === 'path') return a.path.localeCompare(b.path);
    if (sortBy === 'last-mod') return new Date(b.lastMod) - new Date(a.lastMod);
    return 0;
  });

  const resultJson = {
    releasable_space: formatSize(totalSizeKB),
    search_completed: formatTime(process.hrtime.bigint() - startTime),
    num_paths: sortedDirs.length,
    paths: sortedDirs.map(dir => ({
      path: dir.path,
      last_mod: dir.lastMod,
      last_mod_day: dir.lastModDay,
      size: formatSize(dir.size)
    }))
  };

  clearInterval(spinnerInterval);
  console.log();
  console.clear();

  if (saveFile) {
    try {
      await fs.writeFile(saveFile, JSON.stringify(resultJson, null, 2), 'utf-8');
      console.log(`Results saved to ${saveFile}`);
      console.log({
        releasable_space: resultJson.releasable_space,
        search_completed: resultJson.search_completed,
        num_paths: resultJson.num_paths
      });
    } catch (error) {
      console.error(`Failed to save results to ${saveFile}`, error);
    }
  } else {
    console.log(resultJson);
  }
})();

Use:

node main.js --sort size --path . --save jsonFile.json

Upvotes: 0

Views: 126

Answers (1)

Fravadona
Fravadona

Reputation: 17216

Here's an attempt at refactoring your code with Python 2/3. The dependencies are part of the Standard Library so they're available with any Python installation:

import os, sys, fnmatch, time, json, argparse

The downside of not using any external libraries (on top of being compatible with Python 2 & 3) is that you have to reinvent the wheel. For example "humanizing" a size in bytes or recursively "finding" the files in a directory:

def humanize_date(timestamp):
    return time.strftime("%d/%m/%Y %T", time.localtime(timestamp))

def humanize_size(size):
    size = float(size);
    for unit in ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"):
        if size < 1024.0:
           return ("%d %s" if size.is_integer() else "%.2f %s") % (size, unit)
        size /= 1024.0

def find(path, name = "*"):
    if os.path.lexists(path):
        if fnmatch.fnmatch(os.path.basename(path), name):
            yield path
        if not os.path.islink(path) or path.endswith(os.sep):
            for (rootpath, dirnames, filenames) in os.walk(path):
                for direntry in (dirnames + filenames):
                    if fnmatch.fnmatch(direntry, name):
                        yield os.path.join(rootpath, direntry)

Then comes the most important function for implementing the logic; it takes a directory as argument and returns a dict inspired from os.stat_result with its st_size and st_mtime keys changed to "the sum of the size of all files in the directory" and "the modification time of the most recently modified file" respectively:

def dstat(path):
    result = None
    for direntry in find(path):
        stats = os.lstat(direntry)
        if result == None:
            result = {k: getattr(stats, k) for k in dir(stats) if k.startswith("st_")}
            continue
        result["st_size"] += stats.st_size
        if stats.st_mtime > result["st_mtime"]:
            result["st_mtime"] = stats.st_mtime
    return result

note: dstat stands for "directory stat" and also "dict stat"


Now the "main program" just needs to parse the command-line, sort the results and output a JSON:

cli = argparse.ArgumentParser(description='Dummy npkill implementation that outputs JSON')
cli.add_argument('-d', '--directory', default='.', help='Set the directory from which to begin searching (defaults to ".")')
cli.add_argument('-s', '--sort', required=False, choices=['size', 'path', 'last-mod'], help='Sort results by: "size", "path" or "last-mod"')
cli.add_argument('-t', '--target', default='node_modules', help='Specify the name of the directories you want to search (defaults to "node_modules")')

args = cli.parse_args()

results = [ (p, dstat(p)) for p in find(args.directory, name=args.target) ]

if args.sort != None:
    sort_key = (
        (lambda path_dstat: path_dstat[0]            ) if args.sort == 'path' else
        (lambda path_dstat: path_dstat[1]["st_size"] ) if args.sort == 'size' else
        (lambda path_dstat: path_dstat[1]["st_mtime"])
    )
    results = sorted(results, key = sort_key)

results = [
    {
        "path": path,
        "last_mod": humanize_date(stats["st_mtime"]),
        "size": humanize_size(stats["st_size"]),
    }
    for path, stats in results
]

print(json.JSONEncoder().encode(results))

A few thoughts

The problem that you have with the sorting of the dates is that you're trying to compare strings that do not reflect the correct ordering; for eg. why would 21/01/2003 be "lesser" than 20/12/2024? You need to use use numbers (seconds since EPOCH) for the comparisons and convert them to your date format after the sorting.

A difference I can see between du -sb and dstat_result["st_size"] is that my dstat will sum the size of hard-linked files while du won't.

I didn't implement the elapsed time nor the recoverable size, as it isn't part of the main logic required by the program; though I still added the argument parsing ;-)

Upvotes: 4

Related Questions