Zilberman Rafael
Zilberman Rafael

Reputation: 1451

Python deepcopy uses more memory than needed

Recently I came across strange memory usage while using copy.deepcopy.

I have the following code example:

import copy
import gc
import os

import psutil
from pympler.asizeof import asizeof
from humanize import filesize


class Foo(object):
    """Named container holding child objects in two dicts, ``foos`` and ``bars``.

    Both dicts are keyed by the ``name`` attribute of the object that was added.
    ``__slots__`` is declared, so instances carry no per-instance ``__dict__``;
    ``__getstate__`` / ``__setstate__`` expose the slot values as a plain dict.
    """

    __slots__ = ["name", "foos", "bars"]

    def __init__(self, name):
        """Store ``name`` and start with empty ``foos`` and ``bars`` dicts."""
        self.name, self.foos, self.bars = name, {}, {}

    def add_foo(self, foo):
        """Register ``foo`` under ``foo.name``, replacing any entry with that key."""
        key = foo.name
        self.foos[key] = foo

    def add_bar(self, bar):
        """Register ``bar`` under ``bar.name``, replacing any entry with that key."""
        key = bar.name
        self.bars[key] = bar

    def __getstate__(self):
        """Return a dict mapping each slot name to its current value."""
        snapshot = {}
        for slot in self.__slots__:
            snapshot[slot] = getattr(self, slot)
        return snapshot

    def __setstate__(self, state):
        """Assign every ``key: value`` pair of ``state`` as an attribute."""
        for pair in state.items():
            setattr(self, *pair)


class Bar(object):
    """Leaf record carrying a ``name`` and a ``description``.

    ``__slots__`` is declared, so instances carry no per-instance ``__dict__``;
    ``__getstate__`` / ``__setstate__`` expose the slot values as a plain dict.
    """

    __slots__ = ["name", "description"]

    def __init__(self, name, description):
        """Store the given ``name`` and ``description``."""
        self.name, self.description = name, description

    def __getstate__(self):
        """Return a dict mapping each slot name to its current value."""
        snapshot = {}
        for slot in self.__slots__:
            snapshot[slot] = getattr(self, slot)
        return snapshot

    def __setstate__(self, state):
        """Assign every ``key: value`` pair of ``state`` as an attribute."""
        for pair in state.items():
            setattr(self, *pair)

def get_ram():
    """Return the resident set size (RSS), in bytes, of the current process."""
    this_process = psutil.Process(os.getpid())
    # rss is the first field of the tuple returned by memory_info().
    return this_process.memory_info().rss

def get_foo():
    """Build the sample object graph used for the measurement.

    Returns a ``Foo`` named "Foo" that holds 5000 ``Bar`` objects plus one
    nested ``Foo`` ("SubFoo1"), which itself holds another 5000 ``Bar`` objects.
    """

    def fill(target, pattern):
        # The pattern is formatted once for the name and once for the
        # description on purpose: merging the two calls could change what the
        # memory measurement sees.
        for n in range(5000):
            target.add_bar(Bar(pattern.format(n), pattern.format(n)))

    nested = Foo("SubFoo1")
    fill(nested, "BarInSubFoo{}")

    root = Foo("Foo")
    root.add_foo(nested)
    fill(root, "BarInFoo{}")

    return root

def main():
    """Report process memory before and after ``copy.deepcopy`` of the sample graph.

    Prints the ``asizeof`` size of the original and of the copy, the process
    memory reported by ``get_ram()`` before and after the copy, and the
    difference between those two readings.
    """
    original = get_foo()
    original_size = asizeof(original)

    # Collect garbage before each reading so leftover cycles do not skew it.
    gc.collect()
    before = get_ram()

    duplicate = copy.deepcopy(original)

    gc.collect()
    after = get_ram()
    duplicate_size = asizeof(duplicate)

    template = (
        "Original object size: {}, Ram before: {}\n"
        "Copied object size: {}, Ram after: {}\n"
        "Diff in ram: {}"
    )
    figures = (original_size, before, duplicate_size, after, after - before)
    print(template.format(*map(filesize.naturalsize, figures)))

# Run the measurement only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

What I tried to do is measure the amount of memory used by the program before and after the call to copy.deepcopy. For this purpose, I created two classes. I expected my memory usage to rise after the call to deepcopy by an amount equal to the size of the original object. Strangely, I got these results:

Original object size: 2.1 MB, Ram before: 18.6 MB
Copied object size: 2.1 MB, Ram after: 24.7 MB
Diff in ram: 6.1 MB

As you can see, the difference in memory usage is approx. 300% of the size of the copied object.

** These results were obtained using Python 3.8.5 on Windows 10 64-bit.

What have I tried?

Original object size: 2.3 MB, Ram before: 34.3 MB
Copied object size: 2.3 MB, Ram after: 46.2 MB
Diff in ram: 11.9 MB

Any thoughts?

Upvotes: 6

Views: 2163

Answers (1)

juanpa.arrivillaga
juanpa.arrivillaga

Reputation: 95957

Some of that is probably accounted for because deepcopy keeps a cache of all the objects it has visited to avoid getting stuck in an infinite loop (the `memo` dictionary, which maps the id of each visited object to its copy). For this sort of thing, you should probably write your own efficient copy function. deepcopy is written to be able to handle arbitrary inputs, not necessarily to be efficient.

If you want an efficient copying function, you can just write it yourself. This is sufficient for a deep copy, something to the effect of:

import copy
import gc
import os

import psutil
from pympler.asizeof import asizeof
from humanize import filesize


class Foo(object):
    """Named container holding child objects in two dicts, ``foos`` and ``bars``.

    Both dicts are keyed by the ``name`` attribute of the object that was added.
    ``copy()`` provides a hand-written recursive copy of the whole container.
    """

    __slots__ = ["name", "foos", "bars"]

    def __init__(self, name):
        """Store ``name`` and start with empty ``foos`` and ``bars`` dicts."""
        self.name, self.foos, self.bars = name, {}, {}

    def add_foo(self, foo):
        """Register ``foo`` under ``foo.name``, replacing any entry with that key."""
        key = foo.name
        self.foos[key] = foo

    def add_bar(self, bar):
        """Register ``bar`` under ``bar.name``, replacing any entry with that key."""
        key = bar.name
        self.bars[key] = bar

    def copy(self):
        """Return a new ``Foo`` whose children are copies made by their own ``copy()``.

        The ``name`` object itself is shared with the original, not duplicated.
        """
        clone = Foo(self.name)
        for key, child in self.foos.items():
            clone.foos[key] = child.copy()
        for key, item in self.bars.items():
            clone.bars[key] = item.copy()
        return clone

class Bar(object):
    """Leaf record carrying a ``name`` and a ``description``."""

    __slots__ = ["name", "description"]

    def __init__(self, name, description):
        """Store the given ``name`` and ``description``."""
        self.name, self.description = name, description

    def copy(self):
        """Return a new ``Bar`` that refers to the same name and description objects."""
        name, description = self.name, self.description
        return Bar(name, description)

def get_ram():
    """Return the resident set size (RSS), in bytes, of the current process."""
    this_process = psutil.Process(os.getpid())
    # rss is the first field of the tuple returned by memory_info().
    return this_process.memory_info().rss

def get_foo():
    """Build the sample object graph used for the measurement.

    Returns a ``Foo`` named "Foo" that holds 5000 ``Bar`` objects plus one
    nested ``Foo`` ("SubFoo1"), which itself holds another 5000 ``Bar`` objects.
    """

    def fill(target, pattern):
        # The pattern is formatted once for the name and once for the
        # description on purpose: merging the two calls could change what the
        # memory measurement sees.
        for n in range(5000):
            target.add_bar(Bar(pattern.format(n), pattern.format(n)))

    nested = Foo("SubFoo1")
    fill(nested, "BarInSubFoo{}")

    root = Foo("Foo")
    root.add_foo(nested)
    fill(root, "BarInFoo{}")

    return root

def main():
    """Report process memory before and after copying the sample graph with ``Foo.copy()``.

    Prints the ``asizeof`` size of the original and of the copy, the process
    memory reported by ``get_ram()`` before and after the copy, and the
    difference between those two readings.
    """
    original = get_foo()
    original_size = asizeof(original)

    # Collect garbage before each reading so leftover cycles do not skew it.
    gc.collect()
    before = get_ram()

    duplicate = original.copy()

    gc.collect()
    after = get_ram()
    duplicate_size = asizeof(duplicate)

    template = (
        "Original object size: {}, Ram before: {}\n"
        "Copied object size: {}, Ram after: {}\n"
        "Diff in ram: {}"
    )
    figures = (original_size, before, duplicate_size, after, after - before)
    print(template.format(*map(filesize.naturalsize, figures)))

# Run the measurement only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Upvotes: 7

Related Questions