Reputation: 5746
I have a folder with thousands of .pkl
files, each of them containing a list of tuples of objects. The largest file is 8 GB, and the smallest is 1 kB (an empty list). I'm iterating through the files and loading each .pkl individually. Thus, the maximum memory consumption I should get should be that of the largest .pkl
file. However, it went up to 65 GB of RAM and I have no clue why...
In the code below, I have provided the functions that create and read the pickle files, and the plotting function that I call on a specific folder. I did not provide all the functions used, but they should not be the reason behind this issue.
Reason: Functions such as PB only work on the file name, a string, hence their low memory consumption. PB returns only a single value, so the list distance should not be too large.
import os
import _pickle as pickle
from matplotlib import pyplot as plt
def write_solutions(solutions, folder_path, file_name):
    """
    Write the solutions list to a pickle file.

    Any existing file at the target path is overwritten.

    Parameters
    ----------
    solutions : object
        The picklable object (a list of solutions) to store.
    folder_path : str
        Directory in which the file is written.
    file_name : str
        Name of the pickle file inside ``folder_path``.
    """
    # Only ``os`` is imported at module level, so the path is built through
    # ``os.path.join`` rather than a bare ``join``.
    with open(os.path.join(folder_path, file_name), "wb") as output:
        # A negative protocol selects the highest pickle protocol available.
        pickle.dump(solutions, output, -1)
def read_solutions(folder_path, file_name):
    """
    Read a pickle file and return the solutions list it contains.

    Parameters
    ----------
    folder_path : str
        Directory containing the file.
    file_name : str
        Name of the pickle file inside ``folder_path``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling executes arbitrary code embedded in the file; only
    # call this on files produced by a trusted source.
    # Only ``os`` is imported at module level, so the path is built through
    # ``os.path.join`` rather than a bare ``join``.
    with open(os.path.join(folder_path, file_name), "rb") as source:
        solutions = pickle.load(source)
    return solutions
def _draw_signal_row(axis, key, series, max_pb, found_text, missing_text,
                     found_fontsize):
    """Scatter the three series of one row and annotate it with its counts."""
    wanted_x, wanted_y, missing_x, missing_y, closest_x, closest_y = series
    axis.scatter(wanted_x, wanted_y, s=3)
    axis.scatter(missing_x, missing_y, s=3, color="crimson")
    axis.scatter(closest_x, closest_y, s=3, color="teal")
    axis.set_ylim(0.8, 1.4)
    axis.set_ylabel("{} Signals".format(key))
    axis.text(1.01, 0.6, found_text.format(len(wanted_x)),
              fontsize=found_fontsize, transform=axis.transAxes)
    # Only files without a solution whose desired PB does not exceed max_pb
    # are counted in the second annotation.
    within_budget = sum(1 for x in missing_x if x <= max_pb)
    axis.text(1.01, 0.2, missing_text.format(within_budget),
              fontsize=8, transform=axis.transAxes)


def plotting_solution_space_line_detailed(folder, output, N, max_pb, files=None):
    """
    Read .pkl solution files and plot the solution space, one row per group.

    Each file is loaded with ``read_solutions`` and assigned to a group ``n``
    (the length of the first element returned by ``file_name_reader``;
    presumably the number of signals -- confirm against that helper). For
    every file the desired PB (``PB(file)``) is recorded. When the file holds
    at least one solution, the PB of the solution closest to the desired PB
    is recorded as well.

    Parameters
    ----------
    folder : str
        Directory containing the pickle files.
    output : str
        Path of the figure without extension; ``".png"`` is appended.
    N : int
        Largest group index; groups ``2..N`` are considered.
    max_pb : number
        x position of the dashed vertical line, and threshold used when
        counting the files without solution.
    files : list of str, optional
        File names to process. When ``None``, every ``.pkl`` file found in
        ``folder`` is used.
    """
    if files is None:
        # Load all the .pkl files
        files = [name for name in os.listdir(folder) if name.endswith(".pkl")]

    # Per group, six parallel lists:
    #   [0], [1] -> x / y of desired PB for files with at least one solution
    #   [2], [3] -> x / y of desired PB for files with no solution
    #   [4], [5] -> x / y of the PB of the solution closest to the desired PB
    data = {i: [[], [], [], [], [], []] for i in range(2, N + 1)}

    for file in files:
        item = read_solutions(folder, file)
        nfo = file_name_reader(file)
        n = len(nfo[0])
        desired_pb = PB(file)
        if len(item) == 0:
            data[n][3].append(1)
            data[n][2].append(desired_pb)
        else:
            data[n][1].append(1.1)
            data[n][0].append(desired_pb)
            # Computation of the actual closest PB. min() with a key keeps the
            # first element with the smallest distance, without building an
            # intermediate list of all distances.
            closest = min(
                item,
                key=lambda elt: abs(PB(file_namer(elt[0])) - desired_pb),
            )
            actual_pb = PB(file_namer(closest[0]))
            # Plot of the actual PB
            data[n][5].append(1.2)
            data[n][4].append(actual_pb)
            del closest
        # Drop the reference to the loaded solutions so they can be freed
        # before the next file is unpickled, instead of both being held at
        # the same time.
        del item

    # Discard the groups for which no file was seen at all.
    data = {key: series for key, series in data.items()
            if series[0] or series[2]}

    # Creates the figure
    f, ax = plt.subplots(len(data), sharex=True, sharey=True, figsize=(10, 5))
    f.subplots_adjust(hspace=0)
    plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
    for a in f.axes:
        # Real booleans are required here: the string 'False' is not a
        # false value.
        a.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)   # labels along the left edge are off
        # Shrink the axes to leave room on the right for the annotations
        box = a.get_position()
        a.set_position([box.x0, box.y0, box.width * 0.9, box.height])
        # Add a vertical line at the max budget
        a.axvline(x=max_pb, linestyle='--', lw=0.4, color="black")

    if len(data) > 1:
        # With several rows, plt.subplots returns an indexable collection.
        for i, key in enumerate(data):
            _draw_signal_row(ax[i], key, data[key], max_pb,
                             "Nb with solution(s):\n{}",
                             "Nb without solution(s):\n{}", 8)
    else:
        # With a single row, plt.subplots returns the axes object itself.
        key = list(data.keys())[0]
        _draw_signal_row(ax, key, data[key], max_pb,
                         "Nb solutions:\n{}",
                         "Nb no solutions:\n{}", 12)

    f.text(0.5, 0.94, 'Solution space', ha='center')
    f.text(0.5, 0.04, 'PB', ha='center')
    f.text(0.04, 0.5, 'Number of signals', va='center', rotation='vertical')
    plt.savefig("{}.png".format(output), dpi=500)
    plt.close()
Do you see any reason for such high memory consumption with .pkl
files? Is the data not as compact once unpickled into RAM? Or is it another issue?
Upvotes: 1
Views: 161
Reputation: 6395
Your mistake is to assume size equivalence. An integer in a pickle takes little more than the bytes needed to encode it. But an integer in memory takes (some exceptions for small numbers notwithstanding) about 28 bytes.
This is the reason why one would normally use numpy to hold the data, as it provides compact arrays of fixed-size numeric types.
Upvotes: 1