Reputation: 734
I want to create a treemap that shows the folders in a given directory, including all subfolders and files using plotly.graph_objects.Treemap. I understand simple examples like this one and this one.
Problem: I can't figure out how to generate the `ids` column to make my figure render properly. I'm going to have duplicate `labels`, so I need to use `ids`. Right now, the figure renders blank.
Code:
Here's some code to generate a sample directory structure to help you help me:
import os
folder = 'Documents'

# Build a sample tree under ``folder`` (relative to the current working
# directory): 10 folders, each with 100 sub-folders, each holding 20 small
# text files.
for i in range(10):
    for j in range(100):
        path = os.path.join(folder, f'folder_{i}', f'sub-folder-{j}')
        # exist_ok=True makes the script safe to re-run and avoids the
        # check-then-create gap of a separate isdir() test.
        os.makedirs(path, exist_ok=True)
        for k in range(20):
            with open(os.path.join(path, f'file_{k + 1}.txt'), 'w') as file_out:
                file_out.write(f'Hello from file {k + 1}!\n')
Here's the code to calculate the file sizes and create the treemap:
import os
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
# Folder to chart. Replace the bracketed placeholder with the location of
# the 'Documents' folder on your machine.
directory = '[input your directory here]/Documents'
def calculate_size(folder):
    """Walk ``folder`` and return one treemap row per directory and per file.

    Each row is a dict with the keys ``'parents'``, ``'labels'``, ``'size'``
    and ``'ids'``. Paths are expressed relative to the parent of ``folder``,
    so every path starts with the name of ``folder`` itself. Rows follow
    ``os.walk`` order: a directory's row comes first, immediately followed
    by the rows of the files directly inside it.

    A directory's ``'size'`` is the combined size in bytes of the files
    directly inside it (sub-folders are not included); a file's ``'size'``
    is its own size in bytes.

    NOTE(review): ``'parents'`` is filled with the same value as ``'ids'``,
    so every node names itself as its parent. Plotly presumably needs the
    id of the enclosing folder here instead -- confirm before relying on
    the rendered figure.

    Args:
        folder: Path of the directory to walk.

    Returns:
        A list of row dicts as described above.
    """
    base = Path(folder).parent
    result = []
    for root, _dirs, files in os.walk(folder):
        # Stat each file once and reuse the number for both the directory
        # total and the file's own row.
        file_sizes = [
            (name, os.path.getsize(os.path.join(root, name))) for name in files
        ]

        relpath = Path(root).relative_to(base)
        result.append({
            'parents': str(relpath),
            'labels': Path(root).name,
            'size': sum(size for _name, size in file_sizes),
            'ids': str(relpath),
        })

        for name, size in file_sizes:
            relpath_fp = relpath / name
            result.append({
                'parents': str(relpath_fp),
                'labels': name,
                'size': size,
                'ids': str(relpath_fp),
            })
    return result
# Collect one row per directory/file and load the rows into a DataFrame.
result = calculate_size(directory)
df = pd.DataFrame(result)

# Mark the first row (the folder the walk started from) as the root by
# giving it an empty parent.
df.loc[df.index == 0, 'parents'] = ""

# Pull each column out as a plain Python list for the trace.
labels, parents, ids, values = (
    df[column].tolist() for column in ('labels', 'parents', 'ids', 'size')
)

# Optional: pass maxdepth=3 to go.Treemap as well.
fig = go.Figure(
    go.Treemap(labels=labels, parents=parents, ids=ids, values=values)
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()
Upvotes: 2
Views: 1280
Reputation: 18724
You're really close. However, your ids and parents will be different when you have a multi-tiered treemap. Together they create the map for Plotly.
Here I've added another function.
def parPath(idpath):
    """Return the parent path of ``idpath`` for use as a treemap parent id.

    The result is everything before the last path component. For a
    single-component path (the top-level folder) that is the empty string.

    The previous top-level special case compared ``idpath`` with a ``str``;
    a ``Path`` never compares equal to a ``str``, so for ``Path`` input the
    check could not succeed and the result was already the split head.
    Returning the head unconditionally keeps that result and makes ``str``
    input behave the same way.

    Args:
        idpath: Node path as a ``str`` or ``os.PathLike``, e.g.
            ``Documents/folder_0/file_1.txt``.

    Returns:
        The parent path as a ``str`` (``''`` for a single-component path).
    """
    return os.path.split(idpath)[0]
This function, `parPath`, is called within your function, `calculate_size`.
There are four changes in this function (noted with comments).
def calculate_size(folder):
    """Walk ``folder`` and return one treemap row per directory and per file.

    Each row is a dict with the keys ``'parents'``, ``'labels'``, ``'size'``
    and ``'ids'``. ``'ids'`` is the node's path relative to the parent of
    ``folder``; ``'parents'`` is whatever ``parPath`` returns for that path.
    Rows follow ``os.walk`` order: a directory's row comes first,
    immediately followed by the rows of the files directly inside it.

    A directory's ``'size'`` is the combined size in bytes of the files
    directly inside it (sub-folders are not included); a file's ``'size'``
    is its own size in bytes.

    NOTE(review): with ``go.Treemap``'s default ``branchvalues`` a
    directory's own size may be added on top of its children's sizes --
    confirm against the Plotly reference that this is intended.

    Args:
        folder: Path of the directory to walk.

    Returns:
        A list of row dicts as described above.
    """
    base = Path(folder).parent
    result = []
    for root, _dirs, files in os.walk(folder):
        # Stat each file once; the same number feeds the directory total
        # and the file's own row.
        sizes = {
            name: os.path.getsize(os.path.join(root, name)) for name in files
        }

        relpath = Path(root).relative_to(base)
        newpar = parPath(relpath)  # determine if parent and id are different
        result.append({
            'parents': str(newpar),  # was str(relpath)
            'labels': Path(root).name,
            'size': sum(sizes.values()),
            'ids': str(relpath),
        })

        for name, size in sizes.items():
            relpath_fp = relpath / name
            newpar2 = parPath(relpath_fp)  # determine if parent and id are different
            result.append({
                'parents': str(newpar2),  # was str(relpath_fp)
                'labels': name,
                'size': size,
                'ids': str(relpath_fp),
            })
    return result
There is one more modification: in addition to your existing line that changes the first parent, you'll also change the first id.
# Reduce the first row's id to its last path component (the first folder).
df.loc[df.index == 0, 'ids'] = os.path.basename(df.loc[0, 'ids'])
You're ready to plot.
# Draw the treemap from the node lists; ids/parents define the hierarchy.
fig = go.Figure(
    data=go.Treemap(labels=labels, parents=parents, ids=ids, values=values),
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()
Here's a drill down into one of my own folders that I used in testing.
Upvotes: 2
Reputation: 3479
You could use `plotly.express` to create your treemap. You need to create a new column for each level in the treemap (I used a regex to extract that information from the `parents` column in your dataframe).
import os
import re

import plotly.express as px

# Split a four-component path into its components. The separator is taken
# from os.sep (escaped for the regex) so the pattern matches the paths
# produced on the current platform rather than only backslash-separated ones.
sep = re.escape(os.sep)
level_pattern = f'^(.*?){sep}(.*?){sep}(.*?){sep}(.*)'

# regex=True is spelled out because Series.str.replace treats the pattern
# as a literal string by default since pandas 2.0. Rows that do not match
# the pattern keep their original value.
df['level1'] = df['parents'].str.replace(pat=level_pattern, repl=r'\1', regex=True)
df['level2'] = df['parents'].str.replace(pat=level_pattern, repl=r'\2', regex=True)
df['level3'] = df['parents'].str.replace(pat=level_pattern, repl=r'\3', regex=True)
df['level4'] = df['parents'].str.replace(pat=level_pattern, repl=r'\4', regex=True)

# Keep only the rows whose label contains the literal text '.txt'
# (regex=False stops '.' from matching any character).
df = df[df['labels'].str.contains('.txt', regex=False)]

fig = px.treemap(
    df,
    title='treemap of folder structure',
    values='size',
    path=['level1', 'level2', 'level3', 'level4'],
    maxdepth=2,
)
Upvotes: 0