Reputation: 523
I'm having some issues with urllopen.rquest
, I can't even say exactly what's going on, so I'll provide the code and the traceback. First, the code:
import re as rex
import urllib.request
import os
#qui ci piazzo le classi per i vari siti
class Amazon:
def __init__(self,manager,dati):
self.manager=manager
self.indirizzo=None
self.datiAttuali=dati
self.URLimmagine=None
self.prezzo=None
self.speseSped=None
self.setURLimmagine()
self.setPrezzoSpese()
self.caricaDati()
def setURLimmagine(self):
patBlocco=rex.compile(r'<div id="imgTagWrapperId"[\s\S]*</div>')
patUrl=rex.compile(r'http://ecx.images-amazon.com/images/I/[^.]{0,13}')
blocco=rex.search(patBlocco,self.datiAttuali)
if blocco==None:
self.URLimmagine='ND'
else:
url=rex.search(patUrl,blocco.group())
if url!=None:
self.URLimmagine=url.group()+'.jpg'
else:
self.URLimmagine='blocco trovato,ND'
def setPrezzoSpese(self):
patBlocco=rex.compile(r'<div class="a-box"[\s\S]*</div>')
blocco=rex.search(patBlocco,self.datiAttuali)
if blocco!=None:
patSpanPrezzo=rex.compile(r'<span class="a-color-price price3P[\s\S]*</span>')
patSpanSped=rex.compile(r'<span class="a-size-small a-color-secondary shipping3P[\s\S]*</span>')
spanPrezzo=rex.search(patSpanPrezzo,self.datiAttuali)
spanSped=rex.search(patSpanSped,self.datiAttuali)
if spanPrezzo!=None:
patPrezzoText=rex.compile(r'EUR \w+,\w{0,2}')
patSpedText=rex.compile(r'(EUR \w+,\w{0,2})|(Spedizione gratuita)')
prezzoPulito=rex.search(patPrezzoText,spanPrezzo.group()).group()
spedPulito=rex.search(patSpedText,spanSped.group()).group()
self.prezzo=prezzoPulito
self.speseSped=spedPulito
else:
patPrezzo=rex.compile(r'(<span id="priceblock_saleprice"[\s\S]*</span>)|(<span id="priceblock_ourprice"[\s\S]*</span>)')
prezzo=rex.search(patPrezzo,self.datiAttuali)
if prezzo!=None:
patPrezzoRifinito=rex.compile(r'(EUR \w+,\w{0,2})( - )?(EUR \w+,\w{0,2})?')
prezzoRifinito=rex.search(patPrezzoRifinito,prezzo.group())
prezzoRaffinato=prezzoRifinito.group()
self.prezzo=prezzoRaffinato
self.speseSped='ND'
else:
self.prezzo='ND'
self.speseSped='ND'
else:
patPrezzo=rex.compile(r'<span id="priceblock_ourprice"[\s\S]*</span>')
prezzo=rex.search(patPrezzo,self.datiAttuali)
if prezzo!=None:
patPrezzoPulito=rex.compile(r'rice">[\s\S]{1,21}</span>')
prezzoPulito=rex.search(patPrezzoPulito,prezzo.group())
prezzoFinale=prezzoPulito.group()
patPrezzoRifinito=rex.compile(r'(EUR \w+,\w{0,2})( - )?(EUR \w+,\w{0,2})?')
prezzoRifinito=rex.search(patPrezzoRifinito,prezzoFinale)
prezzoRaffinato=prezzoRifinito.group()
self.prezzo=prezzoRaffinato
self.speseSped='ND'
else:
self.prezzo='ND'
self.speseSped='ND'
def caricaDati(self):
self.manager.URLimmagine=self.URLimmagine
self.manager.prezzo=self.prezzo
self.manager.spedizione=self.speseSped
#qui la classe principale
class DBmanager:
def __init__(self,nome):
self.percorso=None
self.nomeDB=nome
self.siti={'amazon':lambda self,URL:Amazon(self,dati)}
self.nomeSito=None #quando verrà passato un URL, per prima cosa verrà pescato il nome del sito, per poter chiamare la
#classe adeguata
self.ID=None
self.URL=None
self.sorgente=None
self.URLimmagine=None
self.prezzo=None
self.spedizione=None
self.descrizione='descrizione'
self.contenuti=[] #qui verranno posizionati gli atomi, creati dopo aver raccolto le info necessarie tramite la classe
#apposita
self.setPercorso()
self.inizializza(nome)
def setPercorso(self):
file=open('config.txt')
percorso=file.readline()
file.close()
self.percorso=percorso[:len(percorso)-1] #questo serve a togliere il \n finale
def setURL(self,URL):
self.URL=URL
pat=rex.compile=r'.+\.(.*)\.'
trovaNome=rex.search(pat,self.URL)
self.nomeSito=trovaNome.group(1)
apri=urllib.request.urlopen(URL)
self.sorgente=str(apri.read())
def chiamaSito(self):
self.siti[self.nomeSito](self,self.sorgente)
def inizializza(self,nome):
try:
file=open(self.percorso+'DATABASE/'+nome+'.adb','r')
file.close()
except:
file=open(self.percorso+'DATABASE/'+nome+'.adb','w')
file.write('<<0>>\n') #se il file adb non esiste, crealo e scrivi l'ID generale
file.close()
os.mkdir(self.percorso+'DATABASE/'+nome) #crea la cartella che conterrà le immagini
def main():
nome='piripacchio'
URL=r'http://www.amazon.it/IMMACOLATA-PORCELLANA-SCULTURA-IMMACULATE-INMACULAD/dp/B016APLOEE/ref=sr_1_1?ie=UTF8&qid=1453325779&sr=8-1&keywords=madonna+statua'
man=DBmanager(nome)
man.setURL(URL)
man.chiamaSito()
if __name__=='__main__':
main()
The idea for this program is that the user enters an URL from Amazon or other sites like that and the program retrieves the image of the object, the sell price and (eventually) shipping fees. This is done by creating a sort of simple database (DBmanager
will populate the list self.contenuti with some dictionaries containing the data). Now, DBmanager
has to beahave exactly the same, no matter the site, and this is accomplished by creating specific classes (like the class Amazon
) that will have the specific functions for retrieving the data (every site has is own structure, so I can't use the same regex) and simply putting them in the dictionary self.siti
: whenever self.chiamaSito
is called, the appropriate class (here I have only Amazon
) will be called passing self.sorgente
as an argument, and this will be the content of the site provided, converted into str, so the class Amazon can search the appropriate informations via regex.
This is for home uses, I already created a similar program that, with the data collected, writes a simple HTML file that is a sort of image gallery of all the objects of interest. What I want to do here, with DBmanager
, is downloading the image so it will be displayed into a tkinter label when I will write the GUI. No HTML file needed, the whole thing will be GUI based, like a sort of database.
The problem is that, whatever URL I provide (here I'm doing it in the main()
function, for debug purposes) I get this evil and nasty "thing":
File "C:\Users\Admin\pyproj\amazons\amazzone2_0.py", line 178, in <module>
main()
File "C:\Users\Admin\pyproj\amazons\amazzone2_0.py", line 173, in main
man.setURL(URL)
File "C:\Users\Admin\pyproj\amazons\amazzone2_0.py", line 148, in setURL
apri=urllib.request.urlopen(URL)
File "C:\Python34\Lib\urllib\request.py", line 153, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\Lib\urllib\request.py", line 440, in open
req = Request(fullurl, data)
File "C:\Python34\Lib\urllib\request.py", line 258, in __init__
self.full_url = url
File "C:\Python34\Lib\urllib\request.py", line 283, in full_url
self._full_url, self.fragment = splittag(self._full_url)
File "C:\Python34\Lib\urllib\parse.py", line 952, in splittag
_tagprog = re.compile('^(.*)#([^#]*)$')
builtins.TypeError: 'str' object is not callable
From what I understand, the problem is in urllib: for some reason the URL is not parsed correctly and I end up with the error. So, I tried to write this:
import urllib.request
sito=urllib.request.urlopen('http://www.amazon.it/Shaving-Factory-rasoio-professionali-singolo/dp/B003DRL6KK/ref=sr_1_2?ie=UTF8&qid=1453313892&sr=8-2&keywords=rasoio')
sorgente=sito.read()
print('yup')
This is wrote in another module, for testing purpose, there is only this piece of code. If I have it for its own, it works perfectly, with no error. I tried with other URLs, same thing: if I try the second piece of code, no errors; if I try to pass it to DBmanager
, I got the error. What is going on?
Upvotes: 0
Views: 467
Reputation: 1123410
You re-bound re.compile()
to a string in the setURL
method:
pat=rex.compile=r'.+\.(.*)\.'
This makes two assignments, one to pat
and the same object to rex.compile
(you imported the re
module as rex
).
You probably meant to call re.compile()
there:
pat = rex.compile(r'.+\.(.*)\.')
Because you rebound re.compile
any other code trying to use that function fails with the same exception; you can't call a string.
Upvotes: 2