Reputation: 743
After solving a problem with a loop that allowed me to extract images and text from a website, I'm having another problem: when I try to save the extracted text as a new row in a CSV file.
What I'm doing is searching for a div with the class "description", then selecting the text that interests me, printing the data (to check that everything is correct) and finally calling writerow with the extracted data (previously I opened the file and added a row with the header).
Edit: my problem is that it only saves one row, the last one that the script searched and extracted. I don't know what I'm doing wrong. I'm going to include the two functions that I have in the script:
main() is where I do what I described above.
def main(url, destino):
""" Acceso al sitio web """
soup = bs(urlopen(url), 'lxml')
parsed = list(urlparse.urlparse(url))
""" Acceso al archivo csv """
fileName = 'datos/datos.csv'
print fileName
f = csv.writer(open(fileName, 'w'))
f.writerow(["Lote", "Dato del lote", "Detalles"]) # Header
""" Acceso a la descr. y escritura en el csv """
description = soup.findAll(True, {'class':['description']})
for text in description:
loteNum = text.contents[1]
loteDat = text.contents[3]
detalle = text.contents[6]
detalleE = detalle.encode("utf-8")
print loteNum
print loteDat
print detalle
f.writerow([loteNum, loteDat, detalleE])
""" Descarga de las img. """
for image in soup.findAll(True, {'class':['list_logo']}):
print "Image: %(src)s" % image
image_url = urlparse.urljoin(url, image['src'])
filename = image["src"].split("/")[-1]
outpath = os.path.join(destino, filename)
urlretrieve(image_url, outpath)
getUrl() allows me to work on a specific range of images that I want to extract. I include it here because I don't know if the problem could come from this function.
def getUrl(opt, baseUrl):
destino = "/home/ivanhercaz/monedasWiki/img"
print "Instrucciones del script \n No te preocupes, no es complicado pero atiende a los pasos"
print "Introduce 1 para obtener los archivos del 00001 al 00010"
print "Introduce 2 para obtener los archivos del 00010 al 00099"
print "Introduce 3 para obtener los archivos del 00100 al 00999"
print "Introduce 4 para obtener los archivos del 01000 al 09999"
print "Introduce 5 para obtener los archivos del 10000 al 19999"
optSel = int(input(opt))
# i es el rango
# urlI es la transformacion de i en cadena
# baseUrl es el enlace al sitio web de Pliego
# url es la url completa con los parametros necesarios
if optSel == 1:
try:
for i in range(0,10):
r = str(0).zfill(4)
urlI = str(i)
url = baseUrl + r + urlI
main(url, destino)
except ValueError:
print "Introduce el rango correcto"
elif optSel == 2:
try:
for i in range(10,100):
r = str(0).zfill(3)
urlI = str(i)
url = baseUrl + r + urlI
main(url, destino)
except ValueError:
print "Introduce el rango correcto"
elif optSel == 3:
try:
for i in range(100,1000):
r = str(0).zfill(2)
urlI = str(i)
url = baseUrl + r + urlI
main(url, destino)
except ValueError:
print "Introduce el rango correcto"
elif optSel == 4:
try:
for i in range(1000,10000):
r = str(0).zfill(1)
urlI = str(i)
url = baseUrl + r + urlI
main(url, destino)
except ValueError:
print "Introduce el rango correcto"
elif optSel == 2:
try:
for i in range(10000,18510):
urlI = str(i)
url = baseUrl + r + urlI
main(url, destino)
except ValueError:
print "Introduce el rango correcto"
elif optSel < 0:
print "Valor inferior a 0"
else:
print "Algo ha salido mal"
Both functions are in the same file. If you could tell me what is going wrong, I'd be very thankful.
Edit: I have changed the way I open and write the file as Moses Koledoye suggested, but the script still writes only the last text checked. I think the problem is related to the loop that checks and adds the rows of text, but I can't find a way to solve it. I share main() again.
<!-- language: python -->
def main(url, destino):
""" Acceso al sitio web """
soup = bs(urlopen(url), 'lxml')
parsed = list(urlparse.urlparse(url))
""" Acceso al archivo csv """
fileName = 'datos/datos.csv'
print fileName
""" Acceso a la descr. y escritura en el csv """
description = soup.findAll(True, {'class':['description']})
for text in description:
loteNum = text.contents[1]
loteDat = text.contents[3]
detalle = text.contents[6]
detalleE = detalle.encode("utf-8")
print loteNum
print loteDat
print detalle
header = ["Lote", "Dato del lote", "Detalles"]
data = [loteNum, loteDat, detalleE]
with open(fileName, 'w') as f:
f = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
f.writerow(header)
f.writerow(data)
""" Descarga de las img. """
for image in soup.findAll(True, {'class':['list_logo']}):
print "Image: %(src)s" % image
image_url = urlparse.urljoin(url, image['src'])
filename = image["src"].split("/")[-1]
outpath = os.path.join(destino, filename)
urlretrieve(image_url, outpath)
Upvotes: 2
Views: 196
Reputation: 13459
# Quoted from the question: the buggy loop.  The file is reopened with
# mode 'w' on EVERY iteration, so each pass truncates the file and only
# the header plus the final row survive.
for text in description:
# ... some functionality
data = [loteNum, loteDat, detalleE]
with open(fileName, 'w') as f:
f = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
f.writerow(header)
f.writerow(data)
Every time you iterate over the description you open the file in write mode, which will overwrite the previous contents. You can either change the mode with which the file is opened to append, or just open the file to write to outside of the loop, like this:
# Corrected version: the file is opened once, before the loop, so every
# description contributes a row instead of overwriting the file.
with open(fileName, 'w') as f:
# Rebinding f to the csv.writer is safe here: the with-statement keeps
# its own reference to the underlying file and still closes it on exit.
f = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
header = ["Lote", "Dato del lote", "Detalles"]
f.writerow(header)
for text in description:
loteNum, loteDat, detalle = [text.contents[i] for i in (1, 3, 6)]
detalleE = detalle.encode("utf-8")
print loteNum, loteDat, detalle
data = [loteNum, loteDat, detalleE]
f.writerow(data)
Upvotes: 1