Reputation: 4122
Before I start, apologies that this is my third attempt at getting across what my issue is. The last two questions seem to have suffered from a communication breakdown. I am using the following Scrapy code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time
import re
import json

class ExampleSpider(CrawlSpider):
    name = "mrcrawl2"
    allowed_domains = ["whoscored.com"]
    start_urls = ["http://www.whoscored.com"]
    download_delay = 5

    rules = [Rule(SgmlLinkExtractor(allow=('/Seasons'), deny=('/News', '/Fixtures', '/Graphics', '/Articles', '/Live', '/Matches', '/Explanations', '/Glossary', '/Players', 'ContactUs', 'TermsOfUse', 'Jobs', 'AboutUs', 'RSS'),), follow=False, callback='parse_item')]

    def parse_item(self, response):
        sel = Selector(response)
        regex = re.compile('DataStore\.prime\(\'history\', { stageId: \d+ },\[\[.*?\]\]?\)?;', re.S)  # use a regex to find the non-HTML data on the page
        match2g = re.search(regex, response.body)
        if match2g is not None:
            match3g = match2g.group()
            match3g = str(match3g)
            match3g = match3g.replace("'", '').replace('[', '').replace(']', '').replace('] );', '')  # strip some characters from the returned string
            match3g = re.sub("DataStore\.prime\(history, { stageId: \d+ },", '', match3g)  # remove the DataStore.prime prefix
            match3g = match3g.replace(');', '')  # and the trailing ');'
            new_match3g = ''  # empty variable to build the converted string in
            for line in match3g.split("\n"):  # for each line of the old string
                upl = line.rsplit(",", 1)[1:]  # split at the last comma
                if upl:
                    upl1 = "{}".format("".join(list(upl[0])))  # old, unseparated data format
                    upl2 = "{}".format(",".join(list(upl[0])))  # new, comma-separated data format
                    upl2 = str(upl2)  # convert both to strings
                    upl1 = str(upl1)
                    new_match3g += line.replace(upl1, upl2) + '\n'  # replace the old substring with the new one while building the new string from the old
                    print "UPL1 = ", upl1  # print both substrings to confirm the conversion completed correctly
                    print "UPL2 = ", upl2
            print new_match3g.decode('utf-8')  # print new and old strings to confirm the new string has been built correctly
            print match3g.decode('utf-8')

execute(['scrapy', 'crawl', 'mrcrawl2'])
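For reference, a minimal, self-contained sketch of what the regex step is meant to do. The DataStore.prime(...) string below is made up and only mimics the shape the pattern expects, not the real whoscored.com payload:

import re

raw = "DataStore.prime('history', { stageId: 123 },[[ 1,2,3 ],[ 4,5,6 ]]);"  # made-up sample, not real site data
regex = re.compile(r"DataStore\.prime\('history', { stageId: \d+ },\[\[.*?\]\]?\)?;", re.S)
match = re.search(regex, raw)
if match is not None:
    print match.group()  # the whole DataStore.prime(...) block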
The purpose of this is to take the digits after the last comma in each line of the parsed data and separate them out with commas.
An example of the conversion taking place would be:
,000
to ,0,0,0
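A minimal sketch of just that conversion, using the same ",".join(list(...)) idiom as in parse_item (the input fragment here is made up):

segment = "000"                       # the digits found after the last comma (made-up example)
converted = ",".join(list(segment))   # gives '0,0,0'
print "," + converted                 # ,0,0,0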
Once this conversion is complete, I create a new empty variable 'new_match3g' and fill it, line by line, from the old variable 'match3g', with my new comma-separated substring replacing the old, non-comma-separated one.
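To make that rebuild step concrete in isolation, a hypothetical single line (the values are invented) run through the same rsplit/replace logic would look like this:

line = "Arsenal,2013,000"          # invented example line
upl = line.rsplit(",", 1)[1:]      # ['000']
upl1 = "".join(list(upl[0]))       # '000'   -- the old substring
upl2 = ",".join(list(upl[0]))      # '0,0,0' -- the new substring
print line.replace(upl1, upl2)     # Arsenal,2013,0,0,0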
I then print both the old and new strings, as well as the before- and after-transformation substrings, to check whether this is working correctly.
For most lines it does; however, seemingly at random, certain substrings are not added to 'new_match3g' correctly, even though they have been converted correctly in the variable 'upl2'.
If you were to run this code yourself and observe some of the output, you would see what I mean. What I don't understand is why this only happens on certain lines, seemingly at random.
Thanks
Upvotes: 0
Views: 1120
Reputation: 180441
s = "foo,bar,foo,foobar"
spl = s.rsplit(",",1)
to_be_updated = spl[1:]
updated = ",".join(to_be_updated[0])
orig = spl[0:1]
final = orig[0] +","+ updated
foo,bar,foo,f,o,o,b,a,r
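Applied per line of the multi-line string from the question (a sketch only, assuming match3g holds the cleaned data from the spider):

new_match3g = ''
for line in match3g.split("\n"):
    spl = line.rsplit(",", 1)                    # head and tail around the last comma
    if len(spl) == 2:
        line = spl[0] + "," + ",".join(spl[1])   # rebuild: head + comma-separated tail
    new_match3g += line + '\n'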
Upvotes: 1