Global variables slowing down PyParsing

Question

When using PyParsing to parse a file, it takes around a minute to parse with Ex1 and around 15 s with Ex2. The only difference is that in Ex1 the variables used by PyParsing are declared as globals before the class, whereas in Ex2 they are declared inside each method individually (e.g. def parse_components(self)). Is this expected? If so, what is the explanation?

Ex1 (parsing time: ~ 60s):

import modules
import pyparsing as pp
from collections import defaultdict
from multiprocessing import (Process, Manager, Event)

# GLOBALS for PyParsing
# Grammar primitives built once at import time and shared by the parse_*
# methods of DEF (e.g. parse_components uses identifier, number, linebreak).
EOL = pp.LineEnd().suppress()  # end of line, dropped from the results
linebreak = pp.Suppress(";" + pp.LineEnd())  # ';' followed by end of line
# Raw string: the literal contains a backslash ("[\]"), which a normal string
# literal would read as the invalid escape sequence "\]". The resulting
# characters are unchanged.
# NOTE(review): the curly quotes in this set may be a copy/paste artefact of
# straight quotes -- confirm against the real input files.
identifier = pp.Word(pp.alphanums + r'._“!<>/[]$#$%&‘*+,/:<=>?@[\]^_`{|}~')
number = pp.pyparsing_common.number
word = pp.Word(pp.alphas)
LPAR = pp.Suppress('(')
RPAR = pp.Suppress(')')
# Orientation codes; Keyword (not Literal) so that e.g. 'N' does not match
# the first letter of a longer word.
ORIENT = (pp.Keyword('N')
        | pp.Keyword('S')
        | pp.Keyword('E')
        | pp.Keyword('W')
        | pp.Keyword('FN')
        | pp.Keyword('FS')
        | pp.Keyword('FE')
        | pp.Keyword('FW'))
pt = LPAR + pp.OneOrMore(number | pp.Keyword('*')) + RPAR  # pair of x,y

class DEF():
    def __init__(self, Base):
        """Set parser options, the section groups to parse, and sync objects.

        Part of the attribute setup is elided (``...``) in this excerpt.
        NOTE(review): ``Base`` is not used in the visible lines; it may be
        consumed by the elided ones.
        """
        # Factory for an arbitrarily nested defaultdict: every missing key
        # yields another defaultdict created by this same factory.
        self.mydict = lambda: defaultdict(self.mydict)
        ...
        ...
        self.ignore_nets = True
        self.ignore_nets_route = False
        # Each list is a new process. Careful with dependencies.
        # eg. 'dbuPerMicron' must be executed before the others, but it can be after 'design'
        self.sections_grp = [['design', 'dbuPerMicron', 'diearea'],
                             ['components'],
                            ]
        # The 'nets' group is only parsed when nets are not ignored.
        if not self.ignore_nets:
            self.sections_grp.append(['nets'])
        # Total number of section names across all groups.
        self.n_elems_sections_grp = sum([len(x) for x in self.sections_grp])
        # NOTE(review): SharedCounter is defined in the project's `modules`
        # package; presumably a process-safe counter -- confirm.
        self.counter = modules.SharedCounter(0)
        # events[0] is waited on by parse_components before it builds its grammar.
        self.events = [Event()]
        self.design = ''
        modules.debug_log.print_repr([self.__repr__()])

    #
    def run(self):
        for curr_file in self.def_file_design:
            ifile = open(curr_file,'r')
            file_string = ifile.read()
            ifile.close()
            self.parse_all(file_string)

    # Create a process for each section to parse
    def parse_all(self, file_string):
        manager = Manager()
        shared_dict = manager.dict()
        jobs = []
        for sections in self.sections_grp:
            p = Process(target=self.parse_sections, args=(sections, file_string, shared_dict))
            jobs.append(p)
            p.start()

        # Wait for the workers to finish
        for job in jobs:
            job.join()

        for sections in self.sections_grp:
            for section in sections:
                getattr(self, 'handle_' + section)(shared_dict)


    # Spawn the processes from each group of self.sections_grp
    def parse_sections(self, sections, file_string, shared_dict):
        for section in sections:
            to_parse = getattr(self, 'parse_' + section)
            for t, s, e in to_parse().scanString(file_string):
                shared_dict.update(t.asDict())
                break

    # Parse the DESIGN section of a .DEF file
    def parse_design(self):
        """Return the grammar for the DESIGN section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return design

    # Parse the UNITS DISTANCE MICRONS section of a .DEF file
    def parse_dbuPerMicron(self):
        """Return the grammar for the UNITS DISTANCE MICRONS section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return dbuPerMicron

    # Parse the DIEAREA section of a .DEF file
    def parse_diearea(self):
        """Return the grammar for the DIEAREA section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return diearea

    # Parse the COMPONENTS section of a .DEF file
    def parse_components(self):
        """Build and return the pyparsing grammar for the COMPONENTS section.

        Uses the module-level primitives ``identifier``, ``number`` and
        ``linebreak``. Several sub-grammars (EEQMASTER, SOURCE, PLACEMENT,
        ...) are defined in lines elided from this excerpt.
        """
        # Blocks until events[0] is set; the code that sets it is not shown here.
        self.events[0].wait()  # Wait for event[0] to finish
        components_id = pp.Keyword('COMPONENTS')
        # Matches the exact text "END COMPONENTS" (a single space between words).
        end_components_id = pp.Keyword("END COMPONENTS").suppress()
        begin_comp = pp.Suppress(pp.Keyword('-'))
        ws_comp = pp.Suppress(pp.Keyword('+'))  # parameter division in components

        # compName
        # Calling identifier(...) returns a copy carrying that results name.
        compName = (identifier('comp_name') + identifier('cell')
                   ).setResultsName('compName')

        ...
        ...
        ...

        # One "- <name> <cell> ... ;" entry; every entry is collected under
        # 'subcomponents' because of listAllMatches=True.
        subcomponent = pp.Group(begin_comp
                                + compName
                                + pp.Optional(EEQMASTER)
                                + pp.Optional(SOURCE)  # NOTE: '+' requires this exact order; '&' would accept any order
                                + pp.Optional(PLACEMENT)
                                + pp.Optional(MASKSHIFT)
                                + pp.Optional(HALO)
                                + pp.Optional(ROUTEHALO)
                                + pp.Optional(WEIGHT)
                                + pp.Optional(REGION)
                                + pp.ZeroOrMore(PROPERTY)
                                + pp.Suppress(linebreak)
                               ).setResultsName('subcomponents', listAllMatches=True)

        # Whole section: header with the component count, one or more
        # entries, then the END COMPONENTS terminator.
        components = pp.Group(pp.Suppress(components_id)
                                          + number('numComps')
                                          + pp.Suppress(linebreak)
                                          + pp.OneOrMore(subcomponent)
                                          + pp.Suppress(end_components_id)
                             ).setResultsName('COMPONENTS')

        return components

Ex2 (parsing time: ~ 15s):

import modules
import pyparsing as pp
from collections import defaultdict
from multiprocessing import (Process, Manager, Event)

class DEF():
    def __init__(self, Base):
        """Set parser options, the section groups to parse, and sync objects.

        Part of the attribute setup is elided (``...``) in this excerpt.
        NOTE(review): ``Base`` is not used in the visible lines; it may be
        consumed by the elided ones.
        """
        # Factory for an arbitrarily nested defaultdict: every missing key
        # yields another defaultdict created by this same factory.
        self.mydict = lambda: defaultdict(self.mydict)
        ...
        ...
        self.ignore_nets = True
        self.ignore_nets_route = False
        # Each list is a new process. Careful with dependencies.
        # eg. 'dbuPerMicron' must be executed before the others, but it can be after 'design'
        self.sections_grp = [['design', 'dbuPerMicron', 'diearea'],
                             ['components'],
                            ]
        # The 'nets' group is only parsed when nets are not ignored.
        if not self.ignore_nets:
            self.sections_grp.append(['nets'])
        # Total number of section names across all groups.
        self.n_elems_sections_grp = sum([len(x) for x in self.sections_grp])
        # NOTE(review): SharedCounter is defined in the project's `modules`
        # package; presumably a process-safe counter -- confirm.
        self.counter = modules.SharedCounter(0)
        # events[0] is waited on by parse_components before it builds its grammar.
        self.events = [Event()]
        self.design = ''
        modules.debug_log.print_repr([self.__repr__()])

    #
    def run(self):
        for curr_file in self.def_file_design:
            ifile = open(curr_file,'r')
            file_string = ifile.read()
            ifile.close()
            self.parse_all(file_string)

    # Create a process for each section to parse
    def parse_all(self, file_string):
        manager = Manager()
        shared_dict = manager.dict()
        jobs = []
        for sections in self.sections_grp:
            p = Process(target=self.parse_sections, args=(sections, file_string, shared_dict))
            jobs.append(p)
            p.start()

        # Wait for the workers to finish
        for job in jobs:
            job.join()

        for sections in self.sections_grp:
            for section in sections:
                getattr(self, 'handle_' + section)(shared_dict)


    # Spawn the processes from each group of self.sections_grp
    def parse_sections(self, sections, file_string, shared_dict):
        for section in sections:
            to_parse = getattr(self, 'parse_' + section)
            for t, s, e in to_parse().scanString(file_string):
                shared_dict.update(t.asDict())
                break


    # Parse the DESIGN section of a .DEF file
    def parse_design(self):
        """Return the grammar for the DESIGN section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return design

    # Parse the UNITS DISTANCE MICRONS section of a .DEF file
    def parse_dbuPerMicron(self):
        """Return the grammar for the UNITS DISTANCE MICRONS section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return dbuPerMicron

    # Parse the DIEAREA section of a .DEF file
    def parse_diearea(self):
        """Return the grammar for the DIEAREA section (body elided here).

        The returned object is used by ``parse_sections``, which calls
        ``.scanString()`` on it.
        """
        ...
        return diearea

    # Parse the COMPONENTS section of a .DEF file
    def parse_components(self):
        """Build and return the pyparsing grammar for the COMPONENTS section.

        All grammar primitives are created locally on every call. Several
        sub-grammars (EEQMASTER, SOURCE, PLACEMENT, ...) are defined in
        lines elided from this excerpt.
        """
        # Blocks until events[0] is set; the code that sets it is not shown here.
        self.events[0].wait()  # Wait for event[0] to finish

        EOL = pp.LineEnd().suppress()
        linebreak = pp.Suppress(";" + pp.LineEnd())
        # Raw string: the literal contains a backslash ("[\]"), which a normal
        # string literal would read as the invalid escape sequence "\]". The
        # resulting characters are unchanged.
        identifier = pp.Word(pp.alphanums + r'._“!<>/[]$#$%&‘*+,/:<=>?@[\]^_`{|}~')
        number = pp.pyparsing_common.number
        word = pp.Word(pp.alphas)
        LPAR = pp.Suppress('(')
        RPAR = pp.Suppress(')')
        ORIENT = (pp.Keyword('N')
                | pp.Keyword('S')
                | pp.Keyword('E')
                | pp.Keyword('W')
                | pp.Keyword('FN')
                | pp.Keyword('FS')
                | pp.Keyword('FE')
                | pp.Keyword('FW'))
        pt = LPAR + pp.OneOrMore(number | pp.Keyword('*')) + RPAR  # pair of x,y

        components_id = pp.Keyword('COMPONENTS')
        # Matches the exact text "END COMPONENTS" (a single space between words).
        end_components_id = pp.Keyword("END COMPONENTS").suppress()
        begin_comp = pp.Suppress(pp.Keyword('-'))
        ws_comp = pp.Suppress(pp.Keyword('+'))  # parameter division in components

        # compName
        # Calling identifier(...) returns a copy carrying that results name.
        compName = (identifier('comp_name') + identifier('cell')
                   ).setResultsName('compName')

        ...
        ...
        ...

        # One "- <name> <cell> ... ;" entry; every entry is collected under
        # 'subcomponents' because of listAllMatches=True.
        subcomponent = pp.Group(begin_comp
                                + compName
                                + pp.Optional(EEQMASTER)
                                + pp.Optional(SOURCE)  # NOTE: '+' requires this exact order; '&' would accept any order
                                + pp.Optional(PLACEMENT)
                                + pp.Optional(MASKSHIFT)
                                + pp.Optional(HALO)
                                + pp.Optional(ROUTEHALO)
                                + pp.Optional(WEIGHT)
                                + pp.Optional(REGION)
                                + pp.ZeroOrMore(PROPERTY)
                                + pp.Suppress(linebreak)
                               ).setResultsName('subcomponents', listAllMatches=True)

        # Whole section: header with the component count, one or more
        # entries, then the END COMPONENTS terminator.
        components = pp.Group(pp.Suppress(components_id)
                                          + number('numComps')
                                          + pp.Suppress(linebreak)
                                          + pp.OneOrMore(subcomponent)
                                          + pp.Suppress(end_components_id)
                             ).setResultsName('COMPONENTS')

        return components

Cireo · Accepted Answer

The likely culprit seems to be staring at you here:

... in Ex1 the variables used by PyParsing are declared as globals before the class.

from multiprocessing import (Process, Manager, Event)

Multiprocessing may be reloading or interacting with this in a funny way. Is DEF.sections_grp always a single list (e.g. 1 process) in your timing tests?

Global variables slowing down PyParsing

Ex1 (parsing time: ~ 60s):

Ex2 (parsing time: ~ 15s):

Answers (1)

Related Questions