Reputation: 1785
I have thousands of html files stored in a remote directory. All these files have same HTML structure. Right now I am scraping every file manually with the following script
from string import punctuation, whitespace
import urllib2
import datetime
import re
from bs4 import BeautifulSoup as Soup
import csv
today = datetime.date.today()
html = urllib2.urlopen("http://hostname/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html").read()
soup = Soup(html)
for li in soup.findAll('li', attrs={'class':'g'}):
sLink = li.find('a')
print sLink['href']
sSpan = li.find('span', attrs={'class':'st'})
print sSpan
So the above script is for one URL. Like wise I wanna scrape through all the html files which are under that directory irrespective of the file names. I do not find that this question has been asked.
Update : Code
import urllib2
import BeautifulSoup
import re
Newlines = re.compile(r'[\r\n]\s+')
def getPageText(url):
# given a url, get page content
data = urllib2.urlopen(url).read()
# parse as html structured document
bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
# kill javascript content
for li in bs.findAll('li', attrs={'class':'g'}):
sLink = li.find('a')
print sLink['href']
sSpan = li.find('span', attrs={'class':'st'})
print sSpan
def main():
urls = [
'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html'
]
txt = [getPageText(url) for url in urls]
if __name__=="__main__":
main()
Upvotes: 0
Views: 2294
Reputation: 369274
Use loop:
...
for url in url_list:
html = urllib2.urlopen(url).read()
soup = Soup(html)
for li in soup.findAll('li', attrs={'class':'g'}):
sLink = li.find('a')
print sLink['href']
sSpan = li.find('span', attrs={'class':'st'})
print sSpan
If you don't know url list in advance, you have to parse listing page.
import csv
import urllib2
import BeautifulSoup
def getPageText(url, filename):
data = urllib2.urlopen(url).read()
bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
with open(filename, 'w') as f:
writer = csv.writer(f)
for li in bs.findAll('li', attrs={'class':'g'}):
sLink = li.find('a')
sSpan = li.find('span', attrs={'class':'st'})
writer.writerow([sLink['href'], sSpan])
def main():
urls = [
'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html',
]
for i, url in enumerate(urls, 1):
getPageText(url, '{}.csv'.format(i))
if __name__=="__main__":
main()
Upvotes: 1