Reputation: 284
I'm modifying some code to download the first 5 image results from a Google Image search. However, I've run into 2 major issues with the following code:
from bs4 import BeautifulSoup
import urllib.request
import os
import json
def get_soup(url,header):
    """Fetch *url* with the given request headers and parse the page.

    ``header`` is passed as the ``headers`` mapping of the request; the
    response is handed to BeautifulSoup with the 'html.parser' backend.
    """
    page_request = urllib.request.Request(url, headers=header)
    page = urllib.request.urlopen(page_request)
    return BeautifulSoup(page, 'html.parser')
# Ask for the search terms; the raw text is reused as the file-name prefix.
query = input('>>> What image do you want? ')
image_type = query
# The search words are joined with '+' to build the query string.
query = '+'.join(query.split())
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
print ('>>> Base searching page from Google image:', url)
DIR = "C:/Users/alex/Desktop/try"
header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
soup = get_soup(url, header)

# (link to the large original image, image type) pairs, read from the JSON
# text of every "rg_meta" div on the result page.
ActualImages = []
for a in soup.find_all("div", {"class": "rg_meta"}):
    meta = json.loads(a.text)
    ActualImages.append((meta["ou"], meta["ity"]))
print('>>> Base page has', len(ActualImages),'images in total')

# Create the base folder first, then the per-query sub-folder inside it.
target_dir = os.path.join(DIR, query.split()[0])
for folder in (DIR, target_dir):
    if not os.path.exists(folder):
        os.mkdir(folder)
DIR = target_dir

# Try the first five links; a failure is reported and the loop moves on.
for img, Type in ActualImages[:5]:
    try:
        # Q1 refers to this line: `header` is already a dict, and it is
        # nested inside another dict here.
        req = urllib.request.Request(img, headers={'User-Agent' : header})
        raw_img = urllib.request.urlopen(req).read()
        # Number the new file after the matching files already in the folder.
        cntr = 1 + sum(1 for name in os.listdir(DIR) if image_type in name)
        print(cntr)
        # Fall back to ".jpg" when the page reported no image type.
        extension = ".jpg" if len(Type)==0 else "."+Type
        f = open(os.path.join(DIR, image_type + "_" + str(cntr) + extension), 'wb')
        f.write(raw_img)
        f.close()
    except Exception as e:
        print('>>> Could not load: '+img)
        print(e)
print ('>>> Finished!')
Q1: In the line of
req = urllib.request.Request(img, headers={'User-Agent' : header})
Python shows me an error saying "expected string or bytes-like object", but if I remove headers={'User-Agent' : header}
, the code works all right. I know the header acts as a permit, but it seems weird that including it prevents the code from working. Could someone help with this issue?
Q2: In several tests, I sometimes got HTTP Error 403: Forbidden
. Which part should I change so that Python keeps trying until I have 5 successfully downloaded images, rather than trying 5 times and reporting that 1 of them failed to download?
Upvotes: 0
Views: 286
Reputation: 3419
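The problem is in the headers of your request: header is already a dict that maps 'User-Agent' to a string, so headers={'User-Agent' : header} sets the User-Agent value to a dict instead of a string, which triggers the "expected string or bytes-like object" error.
Simply change the line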
req = urllib.request.Request(img, headers={'User-Agent' : header})
to
req = urllib.request.Request(img, headers=header)
The modified code
from bs4 import BeautifulSoup
import urllib.request
import os
import json
def get_soup(url,header):
    """Fetch *url* with the given request headers and return the parsed page.

    ``header`` is used as the ``headers`` mapping of the request. The page is
    parsed with BeautifulSoup's 'html.parser' backend while the connection is
    still open, and the connection is closed before returning.
    """
    request = urllib.request.Request(url, headers=header)
    # The context manager closes the HTTP response even if parsing raises.
    with urllib.request.urlopen(request) as response:
        return BeautifulSoup(response, 'html.parser')
# Ask for the search terms; the raw text is reused as the file-name prefix.
query = input('>>> What image do you want? ')
image_type = query
# The search words are joined with '+' to build the query string.
query = '+'.join(query.split())
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
print('>>> Base searching page from Google image:', url)
DIR = "/home/fly/Documents/py/"
header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"}
soup = get_soup(url, header)

# (link to the large original image, image type) pairs, read from the JSON
# text of every "rg_meta" div on the result page.
ActualImages = []
for a in soup.find_all("div", {"class": "rg_meta"}):
    meta = json.loads(a.text)  # parse each blob once instead of once per key
    ActualImages.append((meta["ou"], meta["ity"]))
print('>>> Base page has', len(ActualImages), 'images in total')

# Create the base folder and the per-query sub-folder in one step; this also
# succeeds when either of them already exists.
DIR = os.path.join(DIR, query.split()[0])
os.makedirs(DIR, exist_ok=True)

# Walk through every link and stop after WANTED successful saves, so that a
# failed download (e.g. HTTP Error 403) is replaced by the next candidate
# instead of leaving fewer than WANTED files.
WANTED = 5
saved = 0
for img, Type in ActualImages:
    if saved >= WANTED:
        break
    try:
        # `header` is already the full headers dict, so it is passed as-is.
        req = urllib.request.Request(img, headers=header)
        with urllib.request.urlopen(req) as response:
            raw_img = response.read()
        # Number the new file after the matching files already in the folder.
        cntr = len([name for name in os.listdir(DIR) if image_type in name]) + 1
        print(cntr)
        # Fall back to "jpg" when the page reported no image type.
        extension = Type if Type else "jpg"
        target = os.path.join(DIR, image_type + "_" + str(cntr) + "." + extension)
        with open(target, 'wb') as f:
            f.write(raw_img)
    except Exception as e:
        # Best effort: report the failure and move on to the next link.
        print('>>> Could not load: ' + img)
        print(e)
    else:
        saved += 1
print('>>> Finished!')
Output
>>> What image do you want? cat
>>> Base searching page from Google image: https://www.google.co.in/search?q=cat&source=lnms&tbm=isch
>>> Base page has 100 images in total
1
2
3
4
5
>>> Finished!
Upvotes: 1