Reputation: 70
I'm trying to parse MediaWiki dump files by opening each URL listed in the abstract dump file and parsing the page with BS4. I have about 600k URLs, which by my estimate will take around 200 hours.
import pickle
import re
import urllib.error
import urllib.request

import bs4 as bs
import nltk
from pyarabic import araby

import vocab  # local module providing build_dictionary / save_dictionary

sentenceTokens = []

with open('arwiki-latest-abstract.txt', newline='', encoding='utf-8') as textFile:  # open abstract dump
    for line in textFile:
        if '<url>' in line:
            line = re.sub(r'<[^>]+>', '', line)  # remove <> and anything within
            line = re.sub('\n', '', line)
            print(line)
            requestURL = urllib.request.Request(line, headers={'User-Agent': 'Mozilla/5.0'})
            try:
                scrapeURL = urllib.request.urlopen(requestURL)  # fetch the article page
            except urllib.error.HTTPError:
                continue
            article = scrapeURL.read()
            parsedArticle = bs.BeautifulSoup(article, 'lxml')  # parse webpage
            paragraphs = parsedArticle.find_all('p')  # split article into paragraphs
            textFromURL = ""
            for paragraph in paragraphs:
                textFromURL += paragraph.text  # concatenate paragraphs
            textFromURL = re.sub(r'\[.*?\]', ' ', textFromURL)  # remove [] and anything within
            textFromURL = re.sub(r'\s+', ' ', textFromURL)  # normalise spaces
            textFromURL = araby.strip_tashkeel(textFromURL)  # strip Arabic diacritics
            sentenceTokens += nltk.sent_tokenize(textFromURL)

worddict, wordcount = vocab.build_dictionary(sentenceTokens)
vocab.save_dictionary(worddict, wordcount, 'D:\\Unsupervisedsummarization')

with open('listtext.txt', 'wb') as strlist:  # binary mode; pickle does not take an encoding argument
    pickle.dump(sentenceTokens, strlist)
Could anyone help parallelize this code, or offer some tips on how to make it run faster? Thanks!
Upvotes: 0
Views: 196
Reputation: 917
As mentioned in the comments, you need to parallelize the URL requests and the BS4 parsing, since those are the slowest parts. Here is an example:
Code
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10       # maximum number of concurrent requests
http_ok = [200]  # response codes treated as success


async def scrape(url_list):
    tasks = list()
    sem = Semaphore(limit)
    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)
        result = await gather(*tasks)
    return result


async def scrape_bounded(url, sem, session):
    async with sem:
        return await scrape_one(url, session)


async def scrape_one(url, session):
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to the connection problem' % url)
        return False

    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False

    content = loads(content.decode('UTF-8'))
    return content


if __name__ == '__main__':
    urls = ['http://demin.co/echo1/', 'http://demin.co/echo2/']
    res = run(scrape(urls))
    print(dumps(res, indent=4))
Output
[
    {
        "method": "GET",
        "path": "/",
        "ip": "188.103.31.169",
        "headers": {
            "X-Forwarded-Host": "demin.co",
            "X-Forwarded-Port": "80",
            "X-Forwarded-Proto": "http",
            "X-Forwarded-Agent": "Python/3.7 aiohttp/3.5.4",
            "X-Forwarded-Request": "GET /echo1/ HTTP/1.1"
        }
    },
    {
        "method": "GET",
        "path": "/",
        "ip": "188.103.31.169",
        "headers": {
            "X-Forwarded-Host": "demin.co",
            "X-Forwarded-Port": "80",
            "X-Forwarded-Proto": "http",
            "X-Forwarded-Agent": "Python/3.7 aiohttp/3.5.4",
            "X-Forwarded-Request": "GET /echo2/ HTTP/1.1"
        }
    }
]
You can see the source code here.
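To adapt this to your case, scrape_one would need to return the page HTML rather than decode JSON, and the BeautifulSoup work would run on the downloaded content afterwards. Below is a rough, untested sketch of that combination, reusing the semaphore-limited aiohttp pattern above; extract_urls and extract_text are hypothetical helper names, and the araby/nltk/vocab steps from your original loop are left out so you can plug them back in at the end.

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Rough sketch: fetch the article pages concurrently with aiohttp,
# then parse each page with BS4. extract_urls/extract_text are
# hypothetical helpers; apply your araby/nltk/vocab steps at the end.
import re
from asyncio import Semaphore, ensure_future, gather, run

import bs4 as bs
from aiohttp import ClientSession, client_exceptions

limit = 10
http_ok = [200]


def extract_urls(path):
    """Pull the <url> lines out of the abstract dump file."""
    urls = []
    with open(path, encoding='utf-8') as dump:
        for line in dump:
            if '<url>' in line:
                urls.append(re.sub(r'<[^>]+>', '', line).strip())
    return urls


def extract_text(html):
    """Concatenate the <p> paragraphs of one article into plain text."""
    soup = bs.BeautifulSoup(html, 'lxml')
    text = ' '.join(p.text for p in soup.find_all('p'))
    text = re.sub(r'\[.*?\]', ' ', text)      # remove [] and anything within
    return re.sub(r'\s+', ' ', text).strip()  # normalise whitespace


async def fetch_one(url, sem, session):
    async with sem:  # limit the number of concurrent requests
        try:
            async with session.get(url) as response:
                if response.status not in http_ok:
                    return None
                return await response.text()
        except client_exceptions.ClientError:
            return None


async def fetch_all(urls):
    sem = Semaphore(limit)
    async with ClientSession() as session:
        tasks = [ensure_future(fetch_one(u, sem, session)) for u in urls]
        return await gather(*tasks)


if __name__ == '__main__':
    urls = extract_urls('arwiki-latest-abstract.txt')
    pages = run(fetch_all(urls))
    articles = [extract_text(html) for html in pages if html]
    # from here, apply araby.strip_tashkeel, nltk.sent_tokenize and the
    # vocab functions exactly as in your original loop

With 600k URLs you would probably also want to process the list in chunks of a few thousand rather than creating all the tasks at once, and write intermediate results to disk so a crash doesn't lose everything.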
Upvotes: 1