Reputation: 3
I'm attempting to scrape a website to get a very rough demographic of its users (no personally identifying information or photos), but the tutorial spider from the official documentation that I've modified is repeating the same line of output 4 times in a row.
A copy of the code I'm using is below:
Note that the example profile I've included in the code is a fake/spam account. In the case where it may have already been deleted, you can replace the url with any other on the site and it will work again.
import scrapy
class DateSpider(scrapy.Spider):
    """Spider that reads the profile detail fields from a profile page."""

    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        """Yield one dict of profile details per matched container div.

        Each yielded dict maps an output label to the first text node of
        the ``<span>`` with the corresponding id (``None`` when the span
        has no match).
        """
        # (output label, id of the span that holds the value), in output order.
        span_ids = (
            ('Gender', 'gender'),
            ('Age', 'age'),
            ('State', 'state_id'),
            ('Marital status', 'maritalstatus'),
            ('Body', 'body'),
            ('Height', 'height'),
            ('Ethnicity', 'ethnicity'),
            ('Does drugs?', 'drugs'),
            ('Smokes?', 'smoke'),
            ('Drinks?', 'drink'),
            ('Has children?', 'haschildren'),
            ('Wants children?', 'wantchildren'),
            ('Star sign', 'zodiac'),
            ('Education', 'college_id'),
            ('Personality', 'fishtype'),
        )
        containers = response.xpath('//div[@class="user-details-wide"]')
        for _container in containers:
            # NOTE: every query below is absolute ("//...") and is run on
            # ``response`` rather than on the current container, so each
            # iteration yields the same values -- one dict per matched div.
            yield {
                label: response.xpath(
                    "//span[@id='%s']/text()" % span_id).extract_first()
                for label, span_id in span_ids
            }
Running as follows:
scrapy crawl date -o date.csv
The output I'm looking for is one row of headers followed by one line of results straight after it, not the whitespace and duplicates I'm currently getting.
Upvotes: 0
Views: 403
Reputation: 1549
You don't need to use a for loop. Simply find each span element and extract the data from it.
Also, I suggest you use Scrapy items; it's more convenient. One way to clean whitespace from the extracted data is to use the XPath function normalize-space().
import scrapy
from items import DateSpiderItem
class DateSpider(scrapy.Spider):
    """Spider that collects the profile detail fields into a DateSpiderItem."""

    name = "date"
    start_urls = [
        'http://www.pof.com/viewprofile.aspx?profile_id=141659067',
    ]

    def parse(self, response):
        """Yield a single DateSpiderItem filled from the page's detail spans.

        No loop over container elements is needed: each value lives in a
        ``<span>`` with a unique id, so it is looked up once per page.
        """
        # (item field, span id, wrap the query in normalize-space()?)
        # normalize-space() is applied only to the spans whose text needs
        # its surrounding whitespace cleaned up.
        # NOTE(review): Education and Personality use the plain text() query;
        # switch them to normalize-space() if their values carry whitespace.
        field_specs = (
            ('Gender', 'gender', False),
            ('Age', 'age', False),
            ('State', 'state_id', False),
            ('Marital_status', 'maritalstatus', True),
            ('Body', 'body', False),
            ('Height', 'height', False),
            ('Ethnicity', 'ethnicity', False),
            ('Does_drugs', 'drugs', True),
            ('Smokes', 'smoke', False),
            ('Drinks', 'drink', True),
            ('Has_children', 'haschildren', True),
            ('Wants_children', 'wantchildren', True),
            ('Star_sign', 'zodiac', False),
            ('Education', 'college_id', False),
            ('Personality', 'fishtype', False),
        )

        item = DateSpiderItem()
        for field, span_id, normalize in field_specs:
            query = "//span[@id='%s']/text()" % span_id
            if normalize:
                query = "normalize-space(%s)" % query
            item[field] = response.xpath(query).extract_first()
        yield item
Items file:
import scrapy


class DateSpiderItem(scrapy.Item):
    """Item holding the profile attributes collected by the "date" spider."""

    # One Field per profile attribute; the spider assigns them by name,
    # e.g. item['Gender'] = ...
    Gender = scrapy.Field()
    Age = scrapy.Field()
    State = scrapy.Field()
    Marital_status = scrapy.Field()
    Body = scrapy.Field()
    Height = scrapy.Field()
    Ethnicity = scrapy.Field()
    Does_drugs = scrapy.Field()
    Smokes = scrapy.Field()
    Drinks = scrapy.Field()
    Has_children = scrapy.Field()
    Wants_children = scrapy.Field()
    Star_sign = scrapy.Field()
    Education = scrapy.Field()
    Personality = scrapy.Field()
Output:
Upvotes: 1