Reputation: 530
I can't figure out how to use multithreading or multiprocessing in Python to speed up this scraping process, which collects all the usernames that posted under the hashtag 'cats' on Instagram.
My goal is to make this as fast as possible, because the process is currently rather slow.
from instaloader import Instaloader
# Hashtag whose posts are scanned for owner usernames.
HASHTAG = 'cats'
loader = Instaloader(sleep=False)

# ``users`` keeps the usernames in first-seen order.  ``seen`` holds the same
# names as a set so the duplicate check is a constant-time lookup instead of a
# scan over the whole list for every post.
users = []
seen = set()
for post in loader.get_hashtag_posts(HASHTAG):
    username = post.owner_username
    if username not in seen:
        seen.add(username)
        users.append(username)
        print(username)
Upvotes: 1
Views: 755
Reputation: 530
My goal is to read the hashtags from an input file and write a separate output .txt file for each one; maybe you can help me here too.
The problem should be somewhere around line 45.
I'm not very advanced, so my attempt may contain some wrong code; I don't know.
As example hashtags in input.txt I used: wqddt and d2deltas
from instaloader import Instaloader
import threading
import io
import time
import sys
class LockedIterator(object):
    """Iterator wrapper that serialises access to an underlying iterator.

    Every ``__next__`` call holds ``self.lock`` while it advances the wrapped
    iterator, so only one thread advances it at a time and each item is
    handed to exactly one caller.

    Args:
        it: Any iterable.  It is converted to an iterator once, at
            construction time.
    """

    def __init__(self, it):
        self.lock = threading.Lock()
        # iter() also accepts objects that only implement __getitem__, which
        # a direct it.__iter__() call would reject with AttributeError.
        self.it = iter(it)

    def __iter__(self):
        """Return ``self``; the wrapper is its own iterator."""
        return self

    def __next__(self):
        """Return the next item of the wrapped iterator.

        Raises:
            StopIteration: When the wrapped iterator is exhausted.
        """
        # The with-block releases the lock on every exit path, including the
        # StopIteration raised at the end of the iteration.
        with self.lock:
            return next(self.it)
# Read one hashtag per line.  Blank entries (for example the empty string that
# a trailing newline produces) are dropped because they name no hashtag.
with open('input.txt', 'r', encoding='utf-8') as f:
    HASHTAG = f.read()
p = [line.strip() for line in HASHTAG.split('\n') if line.strip()]
PROFILE = p[:]

# Every worker pulls its next hashtag from this lock-protected iterator, so
# each hashtag is processed by exactly one thread and each output file has a
# single writer.
pending = LockedIterator(PROFILE)
NUM_THREADS = 4  # Input Threads

start_time = time.time()


def worker():
    """Scrape hashtags taken from ``pending`` until none are left.

    For each hashtag a new ``Instaloader`` is created and the owner username
    of every post it yields is appended, one per line, to
    ``downloads/<hashtag>.txt``.  If anything fails for a hashtag (including
    an output file that cannot be opened, e.g. because ``downloads/`` does
    not exist) the hashtag is reported and skipped so the remaining hashtags
    are still processed.
    """
    for pro in pending:
        filename = 'downloads/' + pro + '.txt'
        try:
            # A separate post iterator per hashtag: previously one shared
            # iterator (built for the last hashtag only) was drained into the
            # first hashtag's file.
            posts = Instaloader(sleep=False).get_hashtag_posts(pro)
            with open(filename, 'a', newline='', encoding='utf-8') as fil:
                for post in posts:
                    fil.write(str(post.owner_username) + '\n')
        except Exception as exc:
            # Best effort: keep going with the next hashtag, but say why.
            print('Skipping', pro, '-', exc)


threads = []
for i in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()
for t in threads:
    t.join()

end_time = time.time()
print("Done")
print("Time taken : " + str(end_time - start_time) + "sec")
Upvotes: 0
Reputation: 8576
The LockedIterator class is adapted from here.
import threading
from instaloader import Instaloader
class LockedIterator(object):
    """Wrap an iterable so that several threads can consume it together.

    The wrapped iterator is only ever advanced while ``self.lock`` is held,
    which means concurrent ``next()`` calls are executed one after another
    and no item is delivered to more than one caller.

    Args:
        it: The iterable to wrap; ``iter(it)`` is taken once in the
            constructor.
    """

    def __init__(self, it):
        self.lock = threading.Lock()
        # The built-in iter() falls back to the __getitem__ sequence protocol,
        # so it accepts more inputs than calling it.__iter__() directly.
        self.it = iter(it)

    def __iter__(self):
        """Return the wrapper itself so it can be used in ``for`` loops."""
        return self

    def __next__(self):
        """Advance the wrapped iterator under the lock and return its item.

        Raises:
            StopIteration: Once the wrapped iterator has no more items.
        """
        # Using the lock as a context manager guarantees it is released even
        # when next() raises, StopIteration included.
        with self.lock:
            return next(self.it)
# Hashtag whose posts are scraped.
HASHTAG = 'cats'

# A single lock-protected iterator over the hashtag's posts; the worker
# threads all draw from it.
posts = LockedIterator(
    Instaloader(sleep=False).get_hashtag_posts(HASHTAG)
)

# Owner usernames gathered by the workers; a set, so each name is kept once.
users = set()
def worker():
    """Consume the shared ``posts`` iterator, recording each post's owner.

    Every username is printed and then added to the module-level ``users``
    set.  Any exception is printed and re-raised, which ends this worker.
    """
    try:
        for post in posts:
            owner = post.owner_username
            print(owner)
            users.add(owner)
    except Exception as err:
        print(err)
        raise
# Launch four workers, each started as soon as it is created, then block
# until every one of them has finished.
threads = []
for i in range(4):
    threads.append(threading.Thread(target=worker))
    threads[-1].start()

for t in threads:
    t.join()
Upvotes: 2