Reputation: 1163
My plan is to speed things up: instead of checking the links one by one, I would like to check all the links at the same time, keep a global list to which I append every link that is found, and only print when a new link turns up.
I have created something like this, but I'm stuck on how to apply threading in my situation:
import time

def parseNewArticles():
    siteCatalog = ["https://www.google.se/search?sxsrf=ALeKk03MfsHFjiuq1PInpgtEw0x_ctZKvA%3A1603437501435&source=hp&ei=vYOSX7u3F4PAa8qEpbgM&q=hello+world&oq=hello+world&gs_lcp=CgZwc3ktYWIQAzICCAAyBQguEMsBMgIIADICCAAyAggAMgUIABDLATIFCAAQywEyAggAMgIILjICCAA6BwgjEOoCECc6CQgjEOoCECcQEzoECCMQJzoICAAQsQMQgwE6BQgAELEDOgQIABBDOgcILhBDEJMCOgUILhCxAzoKCC4QsQMQQxCTAjoECC4QQzoICC4QsQMQgwE6BQguEJMCOgQIABAKOgQILhAKOgcIABCxAxAKOgcIABAKEMsBUJMMWPQeYIgfaAVwAHgAgAHAAYgB9wiSAQQxMy4xmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab&ved=0ahUKEwj7icKVlsrsAhUD4BoKHUpCCccQ4dUDCAc&uact=5",
                   "https://www.google.se/search?sxsrf=ALeKk01qENv3p5Dla2i8YR20XykwVN2DOA%3A1603437505803&ei=wYOSX-WMMK_rrgT73KqQCA&q=stackoverflow&oq=stackoverflow&gs_lcp=CgZwc3ktYWIQAzIECCMQJzIECCMQJzICCAAyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgIIADIFCAAQywEyBQgAEMsBOgQILhAnOgUIABCxAzoICAAQsQMQgwE6BAgAEEM6BwgAELEDEEM6BAguEEM6BwguELEDEEM6AgguOgUILhCxA1DsMli3O2D1O2gAcAF4AIABZYgBlQeSAQQxMi4xmAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwil8c6XlsrsAhWvtYsKHXuuCoIQ4dUDCA0&uact=5",
                   "https://www.google.se/search?sxsrf=ALeKk017_VtlAFReIFyxqxp4dEQ0SVqFRg%3A1603437514396&ei=yoOSX4XNF4KEwPAPtOuZiAQ&q=weather&oq=weather&gs_lcp=CgZwc3ktYWIQAzIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywE6BAgAEEc6BAgjECc6BQgAELEDOgQIABBDOgIIADoHCAAQsQMQQzoKCAAQsQMQgwEQQzoICAAQsQMQgwE6AgguUIA8WIFBYK9BaABwAngAgAFaiAGyA5IBATaYAQCgAQGqAQdnd3Mtd2l6yAEIwAEB&sclient=psy-ab&ved=0ahUKEwiF2tublsrsAhUCAhAIHbR1BkEQ4dUDCA0&uact=5"]
    articelsLists = [] # global list that I want to append to from all the links above
    ### Here is where I want each link from siteCatalog to run in its own thread ###
    while True:
        newArticleLists = [] # will be a function that scrapes all the articles and returns its own list
        for articles in newArticleLists:
            if articles not in articelsLists:
                articelsLists.append(articles)
                print(f"[Found: {articles}]")
                time.sleep(600)
            else:
                print(f"[Total in list: {len(articelsLists)}]")
                time.sleep(600)
                continue
My question is: how can I give each link its own thread and have them all append to a global list?
EDIT:
import time
from threading import Thread

articelsLists = [] # Global List

def get_new_article_lists(site):
    newArticleLists = list() # to extract articles from site
    for articles in newArticleLists:
        if articles not in articelsLists:
            articelsLists.append(articles)
            print(f"[Found: {articles}]")
        else:
            print(f"No new articles found!")
    time.sleep(600)

def parseNewArticles():
    siteCatalog = ["https://www.google.se/search?sxsrf=blah",
                   "https://www.google.se/search?sxsrf=blahblah",
                   "https://www.google.se/search?sxsrf=blahblahblah",]
    for site in siteCatalog:
        Thread(target=get_new_article_lists, args=(site, )).start()
Upvotes: 2
Views: 957
Reputation: 44013
You should use a thread pool to retrieve the URLs and scrape the articles. Here, I am using the ThreadPoolExecutor class from the concurrent.futures module. The most efficient way to detect/remove duplicate articles is to use a set:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def get_article_list(url):
    """
    This function scrapes a URL and returns a list of articles.
    """
    pass # for now

def parseNewArticles():
    siteCatalog = ["https://www.google.se/search?sxsrf=ALeKk03MfsHFjiuq1PInpgtEw0x_ctZKvA%3A1603437501435&source=hp&ei=vYOSX7u3F4PAa8qEpbgM&q=hello+world&oq=hello+world&gs_lcp=CgZwc3ktYWIQAzICCAAyBQguEMsBMgIIADICCAAyAggAMgUIABDLATIFCAAQywEyAggAMgIILjICCAA6BwgjEOoCECc6CQgjEOoCECcQEzoECCMQJzoICAAQsQMQgwE6BQgAELEDOgQIABBDOgcILhBDEJMCOgUILhCxAzoKCC4QsQMQQxCTAjoECC4QQzoICC4QsQMQgwE6BQguEJMCOgQIABAKOgQILhAKOgcIABCxAxAKOgcIABAKEMsBUJMMWPQeYIgfaAVwAHgAgAHAAYgB9wiSAQQxMy4xmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab&ved=0ahUKEwj7icKVlsrsAhUD4BoKHUpCCccQ4dUDCAc&uact=5",
                   "https://www.google.se/search?sxsrf=ALeKk01qENv3p5Dla2i8YR20XykwVN2DOA%3A1603437505803&ei=wYOSX-WMMK_rrgT73KqQCA&q=stackoverflow&oq=stackoverflow&gs_lcp=CgZwc3ktYWIQAzIECCMQJzIECCMQJzICCAAyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgIIADIFCAAQywEyBQgAEMsBOgQILhAnOgUIABCxAzoICAAQsQMQgwE6BAgAEEM6BwgAELEDEEM6BAguEEM6BwguELEDEEM6AgguOgUILhCxA1DsMli3O2D1O2gAcAF4AIABZYgBlQeSAQQxMi4xmAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwil8c6XlsrsAhWvtYsKHXuuCoIQ4dUDCA0&uact=5",
                   "https://www.google.se/search?sxsrf=ALeKk017_VtlAFReIFyxqxp4dEQ0SVqFRg%3A1603437514396&ei=yoOSX4XNF4KEwPAPtOuZiAQ&q=weather&oq=weather&gs_lcp=CgZwc3ktYWIQAzIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywEyBQgAEMsBMgUIABDLATIFCAAQywE6BAgAEEc6BAgjECc6BQgAELEDOgQIABBDOgIIADoHCAAQsQMQQzoKCAAQsQMQgwEQQzoICAAQsQMQgwE6AgguUIA8WIFBYK9BaABwAngAgAFaiAGyA5IBATaYAQCgAQGqAQdnd3Mtd2l6yAEIwAEB&sclient=psy-ab&ved=0ahUKEwiF2tublsrsAhUCAhAIHbR1BkEQ4dUDCA0&uact=5"]
    MAX_THREADS_TO_EVER_USE = 100 # depends on many factors; you must experiment
    with ThreadPoolExecutor(max_workers=min(len(siteCatalog), MAX_THREADS_TO_EVER_USE)) as executor:
        new_articles_set = set() # to remove duplicates
        while True:
            futures = [executor.submit(get_article_list, url) for url in siteCatalog]
            for future in as_completed(futures):
                new_article_list = future.result()
                for new_article in new_article_list:
                    if new_article not in new_articles_set:
                        print(f'Found {new_article}')
                        new_articles_set.add(new_article) # there will be no duplicates in the set
            time.sleep(600)
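The get_article_list stub is left for you to fill in. As a rough sketch only (the use of requests and BeautifulSoup, and the catch-all "a" selector, are my assumptions, not details taken from your sites), it could look something like this:

import requests
from bs4 import BeautifulSoup

def get_article_list(url):
    """
    Hypothetical scraper: fetch the page and return the href of every
    anchor tag. Replace the selector with whatever matches the real
    site's article links.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Collect every link on the page; a real scraper would use a more
    # specific selector than plain "a".
    return [a["href"] for a in soup.find_all("a", href=True)]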
Upvotes: 1
Reputation: 72735
I would recommend that, instead of using a separate worker for each URL, you create a pool of workers and have them run the downloads. You can scale the number of workers (and hence the concurrency) up or down depending on your hardware and other requirements. Here's a stripped-down implementation.
import time
from multiprocessing.pool import ThreadPool

urls = ['https://www.google.com', 'https://www.yahoo.com', 'https://www.bing.com']*10

def download(url):
    print("downloading {}".format(url))
    time.sleep(1)
    return "done"

def download_all(workers = 10):
    pool = ThreadPool(workers)
    results = pool.map(download, urls)
    print(results[:10])

if __name__ == "__main__":
    download_all()
By changing the number of workers in the call to download_all, you can scale your solution up or down.
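For example (the worker counts below are arbitrary illustrative values, not recommendations):

# Fewer workers: gentler on your machine and the remote hosts.
download_all(workers=4)

# More workers: higher concurrency, assuming the machine and the sites can handle it.
download_all(workers=50)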
Upvotes: 1
Reputation: 184
As a simple, or maybe naive, approach, you can use Python's threading library.
import time
from threading import Thread

articelsLists = [] # Global List

def get_new_article_lists(site):
    newArticleLists = list() # to extract articles from site
    for articles in newArticleLists:
        if articles not in articelsLists:
            articelsLists.append(articles)
            print(f"[Found: {articles}]")
        else:
            print(f"[Total in list: {len(articelsLists)}]")
    time.sleep(600)

def parseNewArticles():
    siteCatalog = ["https://www.google.se/search?sxsrf=blah",
                   "https://www.google.se/search?sxsrf=blahblah",
                   "https://www.google.se/search?sxsrf=blahblahblah",]
    threads = dict()
    for i, site in enumerate(siteCatalog):
        threads[i] = Thread(target=get_new_article_lists, args=(site, ))
        threads[i].start()
    try:
        for i in threads.keys():
            threads[i].join()
    except:
        # handle exceptions
        pass
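One caveat worth adding (this is my addition, not part of the answer's code): list.append itself is safe under CPython's GIL, but the "if articles not in articelsLists" check followed by the append is a check-then-act race, so two threads can insert the same article. A minimal sketch of guarding that pattern with a threading.Lock:

from threading import Lock

articelsLists = []     # shared global list
articels_lock = Lock() # guards the check-then-append below

def add_if_new(article):
    """Append the article only if it is not already in the global list,
    holding the lock so two threads cannot add the same article at once."""
    with articels_lock:
        if article not in articelsLists:
            articelsLists.append(article)
            print(f"[Found: {article}]")
            return True
    return False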
Upvotes: 1