Reputation: 13
I am trying to build a spider that gathers information about startups. To that end I wrote a Python script with Scrapy that should access the website and store the information in a dictionary. I think the code should work from a logic point of view, but somehow I do not get any output. My code:
import scrapy

class StartupsSpider(scrapy.Spider):
    name = 'startups'
    # name of the spider
    allowed_domains = ['www.bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    # list of allowed domains
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    # starting url

    def parse(self, response):
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        # parse initial start URL for the specific startup URL
        for startup in startups:
            absolute_url = response.urljoin(startup)
            yield scrapy.Request(absolute_url, callback=self.parse_startup)
            # parse the actual startup information
        next_page_url = response.xpath('//*[@class ="pagination-link"]/@href').get()
        # link to next page
        absolute_next_page_url = response.urljoin(next_page_url)
        # go through all pages on start URL
        yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        contact_address_street = response.xpath('//*[@class ="adr"]/text()').get()
        contact_address_plz = response.xpath('//*[@class ="locality"]/text()').getall()
        contact_state = response.xpath('//*[@class ="country-name"]/text()').get()
        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': contact_address_street + contact_address_plz + contact_state}
Upvotes: 0
Views: 71
Reputation: 4822
There are a few problems in your code:

1. allowed_domains is wrong.
2. In the last line (Adresse), you're trying to concatenate list and str types, so you'll get an error.
3. You never check whether next_page_url is None before following it, so the pagination fails on the last page.
4. You're getting None for some of the values and you're trying to get their i'th character, which results in an error.

I fixed 1, 2, and 3. But you'll need to fix number 4 yourself.
import scrapy

class StartupsSpider(scrapy.Spider):
    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['bmwk.de']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page
        next_page_url = response.xpath('(//*[@class ="pagination-link"])[last()]/@href').get()
        if next_page_url:
            # go through all pages on start URL
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        # for example for some of the pages you'll get an error here:
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        Adresse = ' '.join(response.xpath('//*[@class ="address"]//text()').getall())
        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': Adresse}
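For number 4, a minimal sketch of one possible fix (the safe_get helper is my own suggestion, untested against every page of the site): wrap the positional lookups so a missing node gives None instead of raising an IndexError.

def safe_get(selector_list, index):
    # return the index'th extracted text, or None when the page
    # has fewer matching nodes than expected
    try:
        return selector_list[index].get()
    except IndexError:
        return None

# for example, inside parse_startup:
# info = response.css('div.document-info-item::text')
# startup_description = safe_get(info, 16)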
Upvotes: 1
Reputation: 1
You need to run the spider from the command prompt and tell Scrapy to export the scraped items to a file (JSON or CSV), for example:
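scrapy crawl startups -o startups.json
# or, for CSV output:
scrapy crawl startups -o startups.csv

Here startups is the name defined on the spider and -o tells Scrapy which feed file to write.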
Upvotes: 0