Reputation: 154
I've written a script in Python using Selenium to get the titles of different jobs while traversing multiple pages of a website. When I run the script, I notice that Selenium is unable to open that webpage. However, I can see the content of that page without any trouble when I open that very link manually in Internet Explorer or Chrome.
webpage link # If you can't see the content, make sure to refresh the page
I've tried with:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# First page of the job search results; the query string ends in "&region=".
URL = 'https://www.alljobs.co.il/SearchResultsGuest.aspx?page=1&position=235,330,320,236,1541&type=&city=&region='

# The context manager quits the browser even if scraping fails part-way.
with webdriver.Chrome() as driver:
    driver.get(URL)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    while True:
        # Print the title link of every job card on the current page.
        for item in soup.select('[class="job-content-top"]'):
            title = item.select_one('.job-content-top-title a[title]')
            print(title)
        try:
            # Follow the "next page" link; a Selenium error here (for example
            # the link no longer being present) ends the crawl.
            driver.find_element(By.CSS_SELECTOR, '.jobs-paging-next > a').click()
        except WebDriverException:
            # Only Selenium failures stop the loop, so plain programming
            # errors such as a misspelled method name are no longer hidden.
            break
        soup = BeautifulSoup(driver.page_source, 'lxml')
I even tried like this but that didn't work either (collected cookie from the browser):
from bs4 import BeautifulSoup
from selenium import webdriver

# First page of the job search results; the query string ends in "&region=".
URL = 'https://www.alljobs.co.il/SearchResultsGuest.aspx?page=1&position=235,330,320,236,1541&type=&city=&region='
# Raw "Cookie" header value copied from a regular browser session.
cookie = "_ga=GA1.3.1765365490.1582505881; _gid=GA1.3.568643527.1582505881; _fbp=fb.2.1582505881473.1930545410; _hjid=619e3a88-ee5a-43ca-8a0b-e70b063dcf84; BlockerDisplay=; DiplayPopUpSalarySurvey=; OB-USER-TOKEN=390dca4f-08d0-4f54-bce5-00e7e6aa3e39; LPVID=dkY2EwOTNmZTA4YTM1MDI1; HomePageBlocker=1; rbzsessionid=123a4166f92dc4aeb8e66b510e6736f8; ASP.NET_SessionId=f4v4gictcer0qb3qxfptyz1s; GoogleAnalytics_clientId=1765365490.1582505881; _hjIncludedInSample=1; __za_cd_19761230=%7B%22visits%22%3A%22%5B1582513388%2C1582505890%5D%22%2C%22campaigns_status%22%3A%7B%2212147%22%3A1582505953%7D%2C%22historical_goals%22%3A%7B%2254918.54919%22%3A1%2C%2254918.54920%22%3A1%7D%7D; __za_19761230=%7B%22sId%22%3A79638407%2C%22dbwId%22%3A%221%22%2C%22sCode%22%3A%229c1bb50dc33a43741f30126a710f85be%22%2C%22sInt%22%3A5000%2C%22aLim%22%3A1000%2C%22asLim%22%3A100%2C%22na%22%3A0%2C%22td%22%3A1%2C%22ca%22%3A%221%22%7D; __za_cds_19761230=%7B%22data_for_campaign%22%3A%7B%22country%22%3A%22BD%22%2C%22language%22%3A%22EN%22%2C%22ip%22%3A%2245.118.60.18%22%2C%22start_time%22%3A1582513384000%7D%7D; Eloqua=6292d5d5-0f5c-4d0d-8899-a8441f467f38; UserLeaveBlocker=1; googtrans=/iw/en; rbzid=Ywp3JQVmhLXmkPbvxMHFcoE2QLA/Pgp9+eoVrkplSyW/kRRXVTIl8UHjpYV1AVBEvxuHW6KVr8P0N1dlQxqo/TvVEMJ1XfqWI6BZ2HFAUOmgpFx0ubceolso09c3RxkYYHyT1VN6F8nK+skcbbendfbahodU69GZoOGvLaOs142Ws7gmiO9j3AMUflwI3oBKBX/U4Qv8ueUdFb17+mkYKg==; MitugLastZone=200; _gat=1"
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36"

options = webdriver.ChromeOptions()
# The user agent is the only one of the three values Chrome accepts as a
# command-line switch. Chrome has no "--Referer" or "--Cookie" switches, so
# those two are applied through the driver below instead.
options.add_argument(f"--user-agent={USER_AGENT}")
driver = webdriver.Chrome(options=options)

# Extra request headers are set through the DevTools protocol.
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"Referer": URL}})

# Cookies can only be added for the domain that is currently loaded, so open
# the site once, install the cookies, then request the page again.
driver.get(URL)
for pair in cookie.split("; "):
    # Split on the first "=" only: some values (e.g. rbzid) contain "=" too.
    name, _, value = pair.partition("=")
    driver.add_cookie({"name": name, "value": value})
driver.get(URL)
How can I parse the title of different jobs traversing multiple pages?
Upvotes: 1
Views: 328
Reputation: 1556
It looks like this website can detect Selenium and prevents you from seeing the content.
There are a few ways around this. Basically, you can try to make your Selenium session undetectable; a good summary of possible ways to do it:
The solution that worked for me:
Use Firefox (I used v 72.0.2).
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import FirefoxOptions, FirefoxProfile
from selenium.webdriver.common.by import By

# First page of the job search results; the query string ends in "&region=".
URL = 'https://www.alljobs.co.il/SearchResultsGuest.aspx?page=1&position=235,330,320,236,1541&type=&city=&region='
# Seconds to let a results page finish loading before its HTML is read.
PAGE_LOAD_WAIT = 10

# Set proxy
# NOTE(review): only the HTTP proxy is configured; an https:// URL also needs
# "network.proxy.ssl" / "network.proxy.ssl_port" to go through it - confirm.
PROXY_HOST = "12.12.12.123"
PROXY_PORT = "1234"
profile = FirefoxProfile()
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.http", PROXY_HOST)
profile.set_preference("network.proxy.http_port", int(PROXY_PORT))
# Intended to keep the page from seeing that the browser is automated.
profile.set_preference("dom.webdriver.enabled", False)
profile.set_preference('useAutomationExtension', False)
profile.update_preferences()

# Set profile: attach it through the options object, because the
# `firefox_profile=` keyword of webdriver.Firefox is gone in current Selenium.
options = FirefoxOptions()
options.profile = profile
driver = webdriver.Firefox(options=options)
try:
    driver.get(URL)
    time.sleep(PAGE_LOAD_WAIT)  # wait for your site to load
    soup = BeautifulSoup(driver.page_source, 'lxml')
    print(soup)
    while True:
        # Print the title link of every job card on the current page.
        for item in soup.select('[class="job-content-top"]'):
            title = item.select_one('.job-content-top-title a[title]')
            print(title)
        try:
            driver.find_element(By.CSS_SELECTOR, '.jobs-paging-next > a').click()
        except WebDriverException:
            # A Selenium failure on the "next" link ends the crawl.
            break
        # Give the next page the same time to load before re-parsing it;
        # otherwise page_source may still hold the previous page.
        time.sleep(PAGE_LOAD_WAIT)
        soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # Always close the browser and the geckodriver process.
    driver.quit()
The solution with Chrome:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

# First page of the job search results; the query string ends in "&region=".
URL = 'https://www.alljobs.co.il/SearchResultsGuest.aspx?page=1&position=235,330,320,236,1541&type=&city=&region='
# Seconds to let a results page finish loading before its HTML is read.
PAGE_LOAD_WAIT = 5

options = ChromeOptions()
# This hides Selenium:
options.add_argument('disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(options=options)
try:
    driver.get(URL)
    time.sleep(PAGE_LOAD_WAIT)  # wait for it to load
    soup = BeautifulSoup(driver.page_source, 'lxml')
    print(soup)
    while True:
        # Print the title link of every job card on the current page.
        for item in soup.select('[class="job-content-top"]'):
            title = item.select_one('.job-content-top-title a[title]')
            print(title)
        try:
            driver.find_element(By.CSS_SELECTOR, '.jobs-paging-next > a').click()
        except WebDriverException:
            # A Selenium failure on the "next" link ends the crawl.
            break
        # Give the next page the same time to load before re-parsing it;
        # otherwise page_source may still hold the previous page.
        time.sleep(PAGE_LOAD_WAIT)
        soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # Always close the browser and the chromedriver process.
    driver.quit()
Upvotes: 5
Reputation: 3610
To drive Chrome or Chromium, you have to download chromedriver and put it in a folder that is on your system’s path.
Remember that you have to set the path to the chromedriver executable. This is possible using the following line:
from selenium import webdriver

# Point Selenium at the chromedriver binary explicitly instead of relying on
# the system PATH.
# NOTE: Selenium 4 replaces this keyword with a Service object
# (selenium.webdriver.chrome.service.Service) passed as `service=`.
driver = webdriver.Chrome(executable_path='/path/to/chromedriver')
You can check more documentation here
Upvotes: 1