Reputation: 137
I've created a script in Python using the multiprocessing
library to scrape certain fields from a webpage. As I don't have any experience with multiprocessing, I get an error when I execute the script below:
import requests
from lxml.html import fromstring
from multiprocessing import Process

link = "https://www.yellowpages.com/search?search_terms=coffee&geo_location_terms=Los%20Angeles%2C%20CA&page={}"

def create_links(url):
    response = requests.get(url).text
    tree = fromstring(response)
    for title in tree.cssselect("div.info"):
        name = title.cssselect("a.business-name span")[0].text
        street = title.cssselect("span.street-address")[0].text
        try:
            phone = title.cssselect("div[class^=phones]")[0].text
        except IndexError:
            phone = ""
        print(name, street, phone)

if __name__ == '__main__':
    links = [link.format(page) for page in range(4)]
    p = Process(target=create_links, args=(links,))
    p.start()
    p.join()
Error I'm having:
File "...", line 722, in get_adapter
    raise InvalidSchema("No connection adapters were found for '%s'" % url)
I'm getting that error because the script treats the whole list of links as a single link. I suspect the culprit is that I passed the entire list in args=(links,). How can I run it successfully?
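For reference, the failure is reproducible in isolation: requests.get expects a single URL string, and as far as I can tell it stringifies whatever else it receives, so a list becomes "['https://...']" and no http/https connection adapter matches it:

import requests

# handing a list where a URL string is expected raises the same error
requests.get(["https://www.yellowpages.com"])
# requests.exceptions.InvalidSchema: No connection adapters were found for ...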
Upvotes: 0
Views: 86
Reputation: 22440
If you wanted to stick with Process, then the following should work:
import requests
from lxml.html import fromstring
from multiprocessing import Process

link = "https://www.yellowpages.com/search?search_terms=coffee&geo_location_terms=Los%20Angeles%2C%20CA&page={}"

def create_links(url):
    response = requests.get(url).text
    tree = fromstring(response)
    for title in tree.cssselect("div.info"):
        name = title.cssselect("a.business-name span")[0].text
        try:
            street = title.cssselect("span.street-address")[0].text
        except IndexError:
            street = ""
        try:
            phone = title.cssselect("div[class^=phones]")[0].text
        except IndexError:
            phone = ""
        print(name, street, phone)

if __name__ == '__main__':
    items = []
    for links in [link.format(page) for page in range(1, 6)]:
        # one process per page URL, passed as a single string this time
        p = Process(target=create_links, args=(links,))
        items.append(p)
        p.start()
    for process in items:
        process.join()
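One follow-up: print calls from several children can interleave, and sometimes you want the rows back in the parent anyway. A multiprocessing.Queue pairs naturally with Process for that. Below is a minimal sketch, trimmed to the business name only; the Queue wiring is my addition, not part of the code above:

import requests
from lxml.html import fromstring
from multiprocessing import Process, Queue

link = "https://www.yellowpages.com/search?search_terms=coffee&geo_location_terms=Los%20Angeles%2C%20CA&page={}"

def create_links(url, queue):
    tree = fromstring(requests.get(url).text)
    # queue one batch of names per page so the parent can collect them
    queue.put([title.cssselect("a.business-name span")[0].text
               for title in tree.cssselect("div.info")])

if __name__ == '__main__':
    queue = Queue()
    processes = [Process(target=create_links, args=(link.format(page), queue))
                 for page in range(1, 6)]
    for p in processes:
        p.start()
    # drain the queue before joining: a child may not exit while its
    # queued data is still sitting in the pipe buffer
    batches = [queue.get() for _ in processes]
    for p in processes:
        p.join()
    for batch in batches:
        print(batch)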
Upvotes: 1
Reputation: 8164
Works fine with Pool
import requests
from lxml.html import fromstring
from multiprocessing import Pool

link = "https://www.yellowpages.com/search?search_terms=coffee&geo_location_terms=Los%20Angeles%2C%20CA&page={}"

def create_links(url):
    response = requests.get(url).text
    tree = fromstring(response)
    for title in tree.cssselect("div.info"):
        name = title.cssselect("a.business-name span")[0].text
        street = title.cssselect("span.street-address")[0].text
        try:
            phone = title.cssselect("div[class^=phones]")[0].text
        except IndexError:
            phone = ""
        print(name, street, phone)

links = [link.format(page) for page in range(4)]

def main():
    # a pool of four workers; map hands one link to each call
    with Pool(4) as p:
        print(p.map(create_links, links))

if __name__ == '__main__':
    main()
Output
Caffe Latte 6254 Wilshire Blvd (323) 936-5213
Bourgeois Pig 5931 Franklin Ave (323) 464-6008
Beard Papa Sweet Cafe 6801 Hollywood Blvd Ste 157 (323) 462-6100
Intelligentsia Coffee 3922 W Sunset Blvd (323) 663-6173
The Downbeat Cafe 1202 N Alvarado St (213) 483-3955
Sabor Y Cultura 5625 Hollywood Blvd (323) 466-0481
The Wood Cafe 12000 Washington Pl (310) 915-9663
Groundwork Coffee Inc 1501 N Cahuenga Blvd (323) 871-0143
The Apple Pan 10801 W Pico Blvd (310) 475-3585
Good Microbrew & Grill 3725 W Sunset Blvd (323) 660-3645
The Standard Hollywood 8300 W Sunset Blvd (323) 650-9090
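Note that p.map returns whatever create_links returns, so with the code above the map call itself yields a list of None values (the printing happens in the workers). If you would rather print from the parent, a small variation is to return the rows instead; a sketch, reusing the same imports and links list:

def create_links(url):
    tree = fromstring(requests.get(url).text)
    rows = []
    for title in tree.cssselect("div.info"):
        name = title.cssselect("a.business-name span")[0].text
        street = title.cssselect("span.street-address")[0].text
        try:
            phone = title.cssselect("div[class^=phones]")[0].text
        except IndexError:
            phone = ""
        rows.append((name, street, phone))
    return rows  # the returned rows are what p.map collects

def main():
    with Pool(4) as p:
        for rows in p.map(create_links, links):
            for row in rows:
                print(*row)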
Upvotes: 3
Reputation: 1736
You can use Pool from multiprocessing:

from multiprocessing import Pool

and specify the number of worker processes like this:

links = [link.format(page) for page in range(4)]
p = Pool(10)  # number of processes to run at a time
results = p.map(create_links, links)  # apply the scraping function to every link
p.terminate()
p.join()
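Since map blocks until every result is back, the terminate/join pair runs only after the work is done. Equivalently, on Python 3.3+ you can use Pool as a context manager, which calls terminate() for you on exit; a short sketch assuming the link template and create_links function from the question:

if __name__ == '__main__':
    links = [link.format(page) for page in range(4)]
    with Pool(10) as p:
        results = p.map(create_links, links)  # blocks until all pages are processed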
Upvotes: 2