Ethan Leong

Reputation: 27

MemoryError when extracting articles into list using gensim WikiCorpus

I want to build a corpus from a Wikipedia dump (a ~19 GB compressed .bz2 file), but I get a MemoryError when I run the code below. Is there a way to fix this?

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.corpora import WikiCorpus
import sys


def make_corpus(in_f, out_f):
    """Convert a Wikipedia .bz2 dump into a plain-text corpus, one article per line."""
    output = open(out_f, 'w', encoding='utf-8')
    print("File Created!")
    wiki = WikiCorpus(in_f)
    print("Wiki Opened!")
    i = 0

    # get_texts() streams one article at a time as a list of tokens
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        i = i + 1
        if i % 10000 == 0:
            print('Processed ' + str(i) + ' articles...')

    output.close()
    print('Processing Completed!')


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit(1)

    in_f = sys.argv[1]
    out_f = sys.argv[2]
    make_corpus(in_f, out_f)

    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\spawn.py", line 105, in spawn_main
        exitcode = _main(fd)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\spawn.py", line 114, in _main
        prepare(preparation_data)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\spawn.py", line 225, in prepare
        _fixup_main_from_path(data['init_main_from_path'])
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
        run_name="__mp_main__")
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\runpy.py", line 263, in run_path
        pkg_name=pkg_name, script_name=fname)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\runpy.py", line 96, in _run_module_code
        mod_name, mod_spec, pkg_name, script_name)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\runpy.py", line 85, in _run_code
        exec(code, run_globals)
      File "d:\LeongJC\FYP_Code\Code\wikipedia_transformation.py", line 3, in <module>
        import gensim
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\__init__.py", line 11, in <module>
        from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils  # noqa:F401
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\__init__.py", line 6, in <module>
        from .indexedcorpus import IndexedCorpus  # noqa:F401 must appear before the other classes
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\indexedcorpus.py", line 14, in <module>
        from gensim import interfaces, utils
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\interfaces.py", line 19, in <module>
        from gensim import utils, matutils
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\matutils.py", line 19, in <module>
        from scipy.stats import entropy
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\stats\__init__.py", line 388, in <module>
        from .stats import *
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\stats\stats.py", line 174, in <module>
        from scipy.spatial.distance import cdist
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\spatial\__init__.py", line 101, in <module>
        from .procrustes import procrustes
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\spatial\_procrustes.py", line 9, in <module>
        from scipy.linalg import orthogonal_procrustes
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\linalg\__init__.py", line 194, in <module>
        from .misc import *
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\linalg\misc.py", line 4, in <module>
        from .lapack import get_lapack_funcs
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\scipy\linalg\lapack.py", line 783, in <module>
        from scipy.linalg import _flapack
    ImportError: DLL load failed: The paging file is too small for this operation to complete.
    multiprocessing.pool.RemoteTraceback:
    """
    Traceback (most recent call last):
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\pool.py", line 119, in worker
        result = (True, func(*args, **kwds))
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\wikicorpus.py", line 530, in _process_article
        token_max_len=token_max_len, lower=lower,
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\wikicorpus.py", line 490, in process_article
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\wikicorpus.py", line 361, in tokenize
        utils.to_unicode(token) for token in utils.tokenize(content, lower=lower, errors='ignore')
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\utils.py", line 264, in tokenize
        text = text.lower()
    MemoryError
    """

    The above exception was the direct cause of the following exception:

    Traceback (most recent call last):
      File "d:/LeongJC/FYP_Code/Code/wikipedia_transformation.py", line 31, in <module>
        make_corpus(in_f, out_f)
      File "d:/LeongJC/FYP_Code/Code/wikipedia_transformation.py", line 11, in make_corpus
        wiki = WikiCorpus(in_f)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\wikicorpus.py", line 639, in __init__
        self.dictionary = Dictionary(self.get_texts())
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\dictionary.py", line 78, in __init__
        self.add_documents(documents, prune_at=prune_at)
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\dictionary.py", line 196, in add_documents
        for docno, document in enumerate(documents):
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\site-packages\gensim\corpora\wikicorpus.py", line 693, in get_texts
        for tokens, title, pageid in pool.imap(_process_article, group):
      File "C:\Users\asus\anaconda3\envs\tensorflow\lib\multiprocessing\pool.py", line 735, in next
        raise value
    MemoryError


Upvotes: 2

Views: 138

Answers (1)

gojomo

Reputation: 54153

By default, the WikiCorpus class surveys the entire dump file's vocabulary when it is created, even though most users don't need that. It's during that step that you're hitting this MemoryError: your final traceback shows the failure originating in self.dictionary = Dictionary(self.get_texts()) inside WikiCorpus.__init__.

However, if you supply an empty Python dict when creating the WikiCorpus, it'll skip this time- and memory-consuming step.

Specifically, change your line...

    wiki = WikiCorpus(in_f)

...to...

    wiki = WikiCorpus(in_f, dictionary={})

After this change, you may not have any further problems, as it looks like your code is otherwise doing things in an incremental fashion that shouldn't use much memory even on a small-memory machine.
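For reference, here's a minimal sketch of the whole script with that change applied (assuming Python 3 and a gensim version whose get_texts() yields each article as a list of str tokens, as the versions in your traceback do):

    import sys
    from gensim.corpora import WikiCorpus


    def make_corpus(in_f, out_f):
        # dictionary={} skips the up-front vocabulary survey that was
        # exhausting memory during WikiCorpus construction
        wiki = WikiCorpus(in_f, dictionary={})

        with open(out_f, 'w', encoding='utf-8') as output:
            # get_texts() streams one article at a time, so memory use
            # stays roughly constant regardless of the dump's size
            for i, tokens in enumerate(wiki.get_texts(), start=1):
                output.write(' '.join(tokens) + '\n')
                if i % 10000 == 0:
                    print('Processed ' + str(i) + ' articles...')

        print('Processing Completed!')


    if __name__ == '__main__':
        if len(sys.argv) != 3:
            sys.exit(1)
        make_corpus(sys.argv[1], sys.argv[2])

The if __name__ == '__main__' guard stays important here: WikiCorpus tokenizes articles in a multiprocessing pool, which on Windows re-imports your script in each worker process.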

Upvotes: 1
