Dani Mazahreh

Reputation: 70

How to make this Python script run faster?

I'm trying to parse MediaWiki dump files by opening each URL listed in the abstract dump file and parsing the page with BS4. I have about 600k URLs, and at a little over a second per request that works out to roughly 200 hours.

import pickle
import re
import urllib.error
import urllib.request

import bs4 as bs
import nltk
from pyarabic import araby  # assuming pyarabic for strip_tashkeel

import vocab  # local helper module providing build_dictionary / save_dictionary

sentenceTokens = []


with open('arwiki-latest-abstract.txt', newline='', encoding='utf-8') as textFile:  # open text file
    for line in textFile:
        if '<url>' in line:
            line = re.sub('<[^>]+>', '', line)  # remove <> and anything within
            line = re.sub('\n', '', line)
            print(line)
            requestURL = urllib.request.Request(line, headers={'User-Agent': 'Mozilla/5.0'})  # build request with a browser User-Agent
            try:
                scrapeURL = urllib.request.urlopen(requestURL)  # scrape webpage
            except urllib.error.HTTPError:  # skip URLs that fail to load
                continue
            article = scrapeURL.read()
            parsedArticle = bs.BeautifulSoup(article, 'lxml')  # parse webpage
            paragraphs = parsedArticle.find_all('p')  # split article into paragraphs
            textFromURL = ""
            for paragraph in paragraphs:
                textFromURL += paragraph.text  # concat paragraphs
            textFromURL = re.sub(r'\[.*?\]', ' ', textFromURL)  # remove [] and anything within
            textFromURL = re.sub(r'\s+', ' ', textFromURL)  # fix spaces
            textFromURL = araby.strip_tashkeel(textFromURL)
            sentenceTokens += nltk.sent_tokenize(textFromURL)
worddict, wordcount = vocab.build_dictionary(sentenceTokens)
vocab.save_dictionary(worddict, wordcount, 'D:\\Unsupervisedsummarization')
with open('listtext.txt', 'wb') as strlist:  # pickle needs binary mode, so no encoding argument
    pickle.dump(sentenceTokens, strlist)

If anyone could help me parallelize this code, or offer some tips on how to make it run faster, I would appreciate it. Thanks!

Upvotes: 0

Views: 196

Answers (1)

Dmitrii

Reputation: 917

As mentioned in the comments, you need to parallelize the URL requests and the BS4 parsing, since those are the slowest parts of the script. For example:

Code

#!/usr/bin/python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10       # maximum number of concurrent requests
http_ok = [200]  # status codes treated as success


async def scrape(url_list):

    tasks = list()

    sem = Semaphore(limit)

    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)

        result = await gather(*tasks)

    return result


async def scrape_bounded(url, sem, session):
    async with sem:
        return await scrape_one(url, session)


async def scrape_one(url, session):

    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print(f'Scraping {url} failed due to a connection problem')
        return False

    if response.status not in http_ok:
        print(f'Scraping {url} failed due to the return code {response.status}')
        return False

    content = loads(content.decode('UTF-8'))

    return content


if __name__ == '__main__':
    urls = ['http://demin.co/echo1/', 'http://demin.co/echo2/']
    res = run(scrape(urls))

    print(dumps(res, indent=4))

Output

[
    {
        "method": "GET",
        "path": "/",
        "ip": "188.103.31.169",
        "headers": {
            "X-Forwarded-Host": "demin.co",
            "X-Forwarded-Port": "80",
            "X-Forwarded-Proto": "http",
            "X-Forwarded-Agent": "Python/3.7 aiohttp/3.5.4",
            "X-Forwarded-Request": "GET /echo1/ HTTP/1.1"
        }
    },
    {
        "method": "GET",
        "path": "/",
        "ip": "188.103.31.169",
        "headers": {
            "X-Forwarded-Host": "demin.co",
            "X-Forwarded-Port": "80",
            "X-Forwarded-Proto": "http",
            "X-Forwarded-Agent": "Python/3.7 aiohttp/3.5.4",
            "X-Forwarded-Request": "GET /echo2/ HTTP/1.1"
        }
    }
]

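Note that this parallelizes only the network I/O; BeautifulSoup parsing is CPU-bound and would still run one page at a time on the event loop. Below is a minimal sketch of combining the async fetching with a process pool for the parsing (assuming Python 3.7+; parse_article, fetch_and_parse, scrape_all and the example URL are illustrative, not taken from the code above):

#!/usr/bin/python3
# -*- coding: utf-8 -*-

from asyncio import Semaphore, gather, get_running_loop, run
from concurrent.futures import ProcessPoolExecutor

import bs4
from aiohttp import ClientSession

limit = 10  # maximum number of concurrent requests


def parse_article(html):
    # CPU-bound work: runs in a worker process, off the event loop
    soup = bs4.BeautifulSoup(html, 'lxml')
    return ' '.join(p.text for p in soup.find_all('p'))


async def fetch_and_parse(url, sem, session, pool):
    async with sem:  # cap the number of in-flight requests
        async with session.get(url) as response:
            if response.status != 200:
                return None
            html = await response.read()
    # hand the raw HTML to the process pool so parsing
    # does not block the event loop
    return await get_running_loop().run_in_executor(pool, parse_article, html)


async def scrape_all(urls):
    sem = Semaphore(limit)
    with ProcessPoolExecutor() as pool:
        async with ClientSession() as session:
            tasks = [fetch_and_parse(url, sem, session, pool) for url in urls]
            return await gather(*tasks)


if __name__ == '__main__':
    # illustrative URL; in the question this list would come from
    # the <url> lines of arwiki-latest-abstract.txt
    urls = ['https://ar.wikipedia.org/wiki/Python']
    print(run(scrape_all(urls)))

With 600k URLs you would also want to write the results out in batches rather than holding everything in memory.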

Upvotes: 1
