Python, weird memory consumption behavior

Question

I have a folder with thousands of .pkl files, each of them containing a list of tuples of objects. The largest file is 8 GB, and the smallest is 1 kB (an empty list). I'm iterating through the files and loading each .pkl individually. Thus, the maximum memory consumption I expect is roughly the size of the largest .pkl file. However, it went up to 65 GB of RAM and I have no clue why...

In the code below, I provide the functions creating and reading the pickle files, and the plotting function that I called on a specific folder. I did not provide all the functions used, but they should not be the reason behind this issue.

Reason: Functions such as PB only work on the file name, which is a string, so their memory consumption is low. PB returns a single value, so the list `distance` should not be too large.

import os
import _pickle as pickle
from matplotlib import pyplot as plt

def write_solutions(solutions, folder_path, file_name):
    """
    Write the solutions list to a pickle file.

    Any existing file at the target path is overwritten.

    Args:
        solutions: The solutions list to serialize.
        folder_path: Directory in which the file is written.
        file_name: Name of the pickle file inside ``folder_path``.
    """
    # Only ``os`` is imported by this module, so the path has to be built
    # with ``os.path.join``; a bare ``join`` is not defined here.
    target = os.path.join(folder_path, file_name)
    with open(target, "wb") as output:
        # Protocol -1 selects the highest pickle protocol available.
        pickle.dump(solutions, output, -1)

def read_solutions(folder_path, file_name):
    """
    Load a pickle file and return the solutions list it contains.

    Args:
        folder_path: Directory containing the file.
        file_name: Name of the pickle file inside ``folder_path``.

    Returns:
        The unpickled object (the solutions list).
    """
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    # ``os.path.join`` is used because only ``os`` is imported by this
    # module, and the handle is not called ``input`` so the builtin stays
    # usable.
    with open(os.path.join(folder_path, file_name), "rb") as source:
        return pickle.load(source)

def plotting_solution_space_line_detailed(folder, output, N, max_pb, files = None):
    """
    Plot the PB values found in a set of .pkl files, one panel per group.

    Each file is loaded with ``read_solutions`` and assigned to the group
    ``n = len(file_name_reader(file)[0])``. For every group the function
    collects three scatter series:

    * the desired PB of files that contain at least one solution,
    * the desired PB of files whose solution list is empty,
    * the PB of the stored solution closest to the desired PB.

    Groups that received no file are dropped, and the figure is saved as
    ``"<output>.png"``.

    Args:
        folder: Directory containing the .pkl files.
        output: Path of the saved figure without the ``.png`` extension.
        N: Largest group index; groups ``2..N`` are prepared.
        max_pb: x position of the dashed vertical line; also the upper bound
            used when counting the files without solution.
        files: Optional list of file names to process. When ``None``, every
            ``.pkl`` file found in ``folder`` is used.

    Raises:
        KeyError: If a file maps to a group outside ``2..N``.
    """
    if files is None:
        # Load all the .pkl files
        files = [elt for elt in os.listdir(folder) if elt.endswith(".pkl")]

    # Per group: [X, Y, X2, Y2, X3, Y3], i.e. three (x, y) scatter series.
    data = {n: [list() for _ in range(6)] for n in range(2, N + 1)}

    for file in files:
        item = read_solutions(folder, file)
        nfo = file_name_reader(file)
        n = len(nfo[0])
        desired_pb = PB(file)

        if len(item) == 0:
            data[n][3].append(1)
            data[n][2].append(desired_pb)

        else:
            data[n][1].append(1.1)
            data[n][0].append(desired_pb)

            # Plot of the actual closest PB
            data[n][5].append(1.2)
            data[n][4].append(_closest_pb(item, desired_pb))

        # Drop the payload before the next file is unpickled; otherwise the
        # previous list stays referenced until the next load has completed
        # and two payloads are alive at the same time.
        del item

    # Keep only the groups that received at least one file.
    data = {
        key: series for key, series in data.items()
        if len(series[0]) != 0 or len(series[2]) != 0
    }

    # Creates the figure
    f, ax = plt.subplots(len(data), sharex=True, sharey=True, figsize=(10,5))
    f.subplots_adjust(hspace=0)
    plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
    for a in f.axes:
        # Real booleans are required here: the string 'False' is truthy.
        a.tick_params(
            axis='y',           # changes apply to the y-axis
            which='both',       # both major and minor ticks are affected
            left=False,
            right=False,
            labelleft=False)    # labels along the left edge are off

        # Shrink the axes
        box = a.get_position()
        a.set_position([box.x0, box.y0, box.width * 0.9, box.height])

        # Add a vertical line at the max budget
        a.axvline(x=max_pb, linestyle= '--',lw = 0.4, color = "black")

    if len(data) > 1:
        # With several rows, ``ax`` holds one axes object per group.
        for axis, (key, series) in zip(ax, data.items()):
            _draw_panel(axis, key, series, max_pb,
                        "Nb with solution(s):\n{}", 8,
                        "Nb without solution(s):\n{}")

    else:
        # With a single row, ``ax`` is the axes object itself.
        key, series = list(data.items())[0]
        _draw_panel(ax, key, series, max_pb,
                    "Nb solutions:\n{}", 12,
                    "Nb no solutions:\n{}")

    f.text(0.5, 0.94, 'Solution space', ha='center')
    f.text(0.5, 0.04, 'PB', ha='center')
    f.text(0.04, 0.5, 'Number of signals', va='center', rotation='vertical')

    plt.savefig("{}.png".format(output), dpi = 500)
    plt.close()


def _closest_pb(solutions, desired_pb):
    """Return the PB of the first solution whose PB is nearest to desired_pb."""
    # ``min`` with a key scans the solutions once without building an
    # intermediate list of distances; on ties it keeps the first element.
    closest = min(
        solutions,
        key=lambda sol: abs(PB(file_namer(sol[0])) - desired_pb),
    )
    return PB(file_namer(closest[0]))


def _draw_panel(axis, key, series, max_pb, with_label, with_fontsize, without_label):
    """Draw the three scatter series of one group and its two count labels."""
    X, Y, X2, Y2, X3, Y3 = series
    axis.scatter(X, Y, s = 3)
    axis.scatter(X2, Y2, s = 3, color = "crimson")
    axis.scatter(X3, Y3, s = 3, color = "teal")
    axis.set_ylim(0.8, 1.4)
    axis.set_ylabel("{} Signals".format(key))
    axis.text(1.01, 0.6, with_label.format(len(X)),
              fontsize=with_fontsize, transform=axis.transAxes)
    # Only the files without solution at or below max_pb are counted.
    axis.text(1.01, 0.2, without_label.format(sum(1 for x in X2 if x <= max_pb)),
              fontsize=8, transform=axis.transAxes)

Do you see any reason for such high memory consumption with .pkl files? Is the data less compact once unpickled into RAM than it is on disk? Or is it another issue?

deets · Accepted Answer

Your mistake is to assume size equivalence. An integer in a pickle takes little more than the bytes needed to store its value. But an integer in memory takes (some exceptions for small numbers notwithstanding) about 28 bytes.

This is the reason why one would normally use numpy to hold the data, as it stores values in compact arrays of fixed-size numeric types.

Python, weird memory consumption behavior

Answers (1)

Related Questions