Reputation: 2237
Trying to scrape this site.
https://www.foodpanda.sg/restaurants/new?lat=1.2915902&lng=103.8379066&vertical=restaurants
for restaurant details.
I need to scroll down to get more restaurants. How do I avoid duplicates when new elements are loaded? As of now it loads the new elements, but in the CSV most of them are duplicates. I tried something like the following, which works on another site, but it didn't work out on this one.
# Scroll until no new result anchors load, then collect them all.
current_len = len(likes_div.find_elements_by_xpath('//div[@class="q9uorilb"]//a'))
while True:
    # Send END to the first matching anchor to trigger lazy loading.
    likes_div.find_element_by_xpath('.//div[@class="q9uorilb"]//a').send_keys(Keys.END)
    try:
        # Wait up to 5 s for the element count to grow past what we had.
        WebDriverWait(driver, 5).until(
            lambda d: len(d.find_elements_by_xpath('.//div[@class="q9uorilb"]//a')) > current_len)
        current_len = len(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a'))
    except TimeoutException:
        # No new elements appeared: grab the final list and stop.
        # (The original had no break here, so the loop never terminated.)
        name_eles = driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a')
        break
This is my code for the web page above.
def get_rest(wait_seconds=15):
    """Return the restaurant <li> card elements currently in the page.

    wait_seconds: seconds to sleep first so lazily-loaded cards can render.
        Defaults to 15 to preserve the original behavior.

    NOTE(review): a fixed sleep is fragile — a WebDriverWait on the locator
    would be more reliable; kept as a sleep to match the page's behavior.
    """
    time.sleep(wait_seconds)
    restaurant_locator = '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]'
    return driver.find_elements_by_xpath(restaurant_locator)
def get_data(rests):
    """Extract details from each restaurant card and append rows to the CSV.

    rests: iterable of selenium WebElements (the <li> cards from get_rest).

    Appends one row per card to Food_Panda_test.csv, writing the header only
    once (tracked by the module-level flag ``header_added``). Also prints
    each field as it is scraped, as in the original script.
    """
    global header_added

    def _text(elem, xpath, default):
        # Best-effort lookup: missing sub-elements are normal on this page,
        # so any failure simply yields the placeholder value.
        try:
            return elem.find_element_by_xpath(xpath).text
        except Exception:
            return default

    rows = []
    for rest in rests:
        name = _text(rest, './/span[@class="name fn"]', 'No name')
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        rating = _text(rest, './/span[@class="rating"]', None)
        # Trim the trailing 2-char suffix only when a rating was found.
        rating = rating[:-2] if rating is not None else 'No Ratings Available'
        print(rating)
        print('*********')
        cuisine = _text(rest, './/ul[@class="categories summary"]', None)
        # Drop the 4-char prefix only when the element was found.
        cuisine = cuisine[4:] if cuisine is not None else 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        distance = _text(rest, './/span[@class="badge-info"]', 'No Distance available')
        print(distance)
        print('***********')
        tags = _text(rest, './/div[@class="tag-container"]', 'No special Offers')
        print(tags)
        print('************')
        cashback = _text(rest, './/span[@class="vendor-cashback-info"]', 'No Cashback available')
        print(cashback)
        # Bug fix: the original collected `link` but never wrote it; the
        # author's later version includes it under "URL", so do the same.
        rows.append({'Restaurant Name': name, 'URL': link, 'Rating': rating,
                     'Cuisine': cuisine, 'Delivery Time': distance,
                     'Tags': tags, 'Cashback': cashback})

    if not rows:
        return
    # Open the file once per batch instead of once per row; newline='' is
    # required by the csv module to avoid blank lines on Windows.
    fieldnames = ['Restaurant Name', 'URL', 'Rating', 'Cuisine',
                  'Delivery Time', 'Tags', 'Cashback']
    with open('Food_Panda_test.csv', 'a+', encoding='utf-8-sig', newline='') as f:
        w = csv.DictWriter(f, fieldnames)
        if not header_added:
            w.writeheader()
            header_added = True
        w.writerows(rows)
# Main driver: keep fetching whatever restaurant elements are currently
# rendered and append them to the CSV.
# NOTE(review): this loop never terminates and re-reads the same elements
# on every pass — this is exactly why the CSV fills with duplicate rows.
while True:
p = get_rest()
get_data(p)
Upvotes: 1
Views: 663
Reputation: 2237
I was able to solve this using the idea given by @cruisepandey. Thanks
# Scrape the FoodPanda listing with incremental scrolling, skipping cards
# that were already written on a previous pass.

def _field(elem, xpath, default):
    """Best-effort text of the first node under elem matching xpath."""
    try:
        return elem.find_element_by_xpath(xpath).text
    except Exception:
        return default

FIELDNAMES = ['Restaurant Name', 'URL', 'Rating', 'Cuisine',
              'Delivery Time', 'Tags', 'Cashback']

old_rest = set()  # WebElements already scraped on earlier passes
while True:
    # Scroll one step and give the lazy loader time to render new cards.
    driver.execute_script("window.scrollBy(0,3825)", "")
    time.sleep(15)
    restaurant_locator = '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]'
    restaurants = driver.find_elements_by_xpath(restaurant_locator)
    # Set difference removes the cards processed on a previous iteration.
    # NOTE(review): this relies on selenium returning the same WebElement
    # identity for unchanged DOM nodes — appears to hold here, but would
    # break if the page re-renders its list; confirm before reuse.
    new_cards = set(restaurants) - old_rest
    rows = []
    for rest in new_cards:
        driver.execute_script("arguments[0].scrollIntoView();", rest)
        name = _field(rest, './/span[@class="name fn"]', 'No name')
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        rating = _field(rest, './/span[@class="rating"]', None)
        # Trim the trailing 2-char suffix only when a rating was found.
        rating = rating[:-2] if rating is not None else 'No Ratings Available'
        print(rating)
        print('*********')
        cuisine = _field(rest, './/ul[@class="categories summary"]', None)
        # Drop the 4-char prefix only when the element was found.
        cuisine = cuisine[4:] if cuisine is not None else 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        distance = _field(rest, './/span[@class="badge-info"]', 'No Distance available')
        print(distance)
        print('***********')
        tags = _field(rest, './/div[@class="tag-container"]', 'No special Offers')
        print(tags)
        print('************')
        cashback = _field(rest, './/span[@class="vendor-cashback-info"]', 'No Cashback available')
        print(cashback)
        rows.append({'Restaurant Name': name, 'URL': link, 'Rating': rating,
                     'Cuisine': cuisine, 'Delivery Time': distance,
                     'Tags': tags, 'Cashback': cashback})
    if rows:
        # One open per batch (not per row); newline='' keeps the csv module
        # from emitting blank lines on Windows.
        with open('Food_Panda_test.csv', 'a+', encoding='utf-8-sig', newline='') as f:
            w = csv.DictWriter(f, FIELDNAMES)
            if not header_added:  # header_added is set up earlier in the file
                w.writeheader()
                header_added = True
            w.writerows(rows)
    old_rest = set(restaurants)  # remember everything seen so far
Upvotes: 0
Reputation: 29362
I think removing duplicates should be done once you have the restaurant details.
Let's say you have a list restaurants
with duplicates.
Remove duplicate entries like this:
def dedupe(items):
    """Return a new list with duplicates removed, keeping first-seen order.

    items: any iterable of hashable values.

    (Reconstructed: the pasted snippet pre-filled ``seen`` with the whole
    list, never bound ``item`` in a loop, and appended back into the list
    being deduplicated — so nothing was ever removed.)
    """
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
If you have multiple lists, then make a generic function and pass each list to it to get the job done.
Update 1:
When you do p = get_rest(),
p is a list.
Do this:
def remove_duplicates(p):
    """Return ``p`` without duplicate entries, preserving original order.

    p: the list returned by ``get_rest()`` (any hashable elements work).
    Use as: ``p = remove_duplicates(p)`` before calling ``get_data(p)``.

    (Reconstructed: the pasted snippet seeded ``seen`` from ``p`` itself and
    had no loop, so the membership test could never admit anything.)
    """
    seen = set()
    result = []
    for item in p:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
Now all duplicates will be removed.
Then do
get_data(p)
Upvotes: 1