Reputation: 7
I am trying to crawl data with Selenium in Colab, but I can't figure out why the crawl fails.
This is the error message:

WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally. (unknown error: DevToolsActivePort file doesn't exist) (The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)

I don't know what to do.
I know it is hard to review because my code crawls a Korean website (Naver).
I'm sorry about that, but I hope you can give me a hint to fix the error. Thanks!
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
!pip install webdriver_manager
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib
from datetime import datetime
from tqdm import tqdm
import os
# chromedriver settings: headless mode plus the flags usually needed to run Chrome in Colab
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)
def get_article_info(driver, crawl_date, press_list, title_list, link_list, date_list, more_news_base_url=None, more_news=False):
    more_news_url_list = []
    while True:
        page_html_source = driver.page_source
        url_soup = BeautifulSoup(page_html_source, 'lxml')
        more_news_infos = url_soup.select('a.news_more')
        if more_news:
            for more_news_info in more_news_infos:
                more_news_url = f"{more_news_base_url}{more_news_info.get('href')}"
                more_news_url_list.append(more_news_url)
        article_infos = url_soup.select("div.news_area")
        if not article_infos:
            break
        for article_info in article_infos:
            press_info = article_info.select_one("div.info_group > a.info.press")
            if press_info is None:
                press_info = article_info.select_one("div.info_group > span.info.press")
            article = article_info.select_one("a.news_tit")
            press = press_info.text.replace("언론사 선정", "")  # strip the "selected by outlet" label
            title = article.get('title')
            link = article.get('href')
            press_list.append(press)
            title_list.append(title)
            link_list.append(link)
            date_list.append(crawl_date)
        time.sleep(2.0)
        next_button_status = url_soup.select_one("a.btn_next").get("aria-disabled")
        if next_button_status == 'true':  # last page reached
            break
        time.sleep(1.0)
        driver.find_element(By.CSS_SELECTOR, "a.btn_next").click()
    return press_list, title_list, link_list, more_news_url_list
def get_naver_news_info_from_selenium(keyword, save_path, target_date, ds_de, sort=0, remove_duplicate=False):
    crawl_date = f"{target_date[:4]}.{target_date[4:6]}.{target_date[6:]}"
    driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver', options=options)  # chromedriver file path?
    encoded_keyword = urllib.parse.quote(keyword)
    url = f"https://search.naver.com/search.naver?where=news&query={encoded_keyword}&sm=tab_opt&sort={sort}&photo=0&field=0&pd=3&ds={ds_de}&de={ds_de}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom{target_date}to{target_date}&is_sug_officeid=0"
    more_news_base_url = "https://search.naver.com/search.naver"
    driver.get(url)
    press_list, title_list, link_list, date_list, more_news_url_list = [], [], [], [], []
    press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver,
                                                                             crawl_date=crawl_date,
                                                                             press_list=press_list,
                                                                             title_list=title_list,
                                                                             link_list=link_list,
                                                                             date_list=date_list,
                                                                             more_news_base_url=more_news_base_url,
                                                                             more_news=True)
    driver.close()
    if len(more_news_url_list) > 0:
        print(len(more_news_url_list))
        more_news_url_list = list(set(more_news_url_list))  # drop duplicate "more news" URLs
        print(f"->{len(more_news_url_list)}")
        for more_news_url in more_news_url_list:
            driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
            driver.get(more_news_url)
            press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver,
                                                                                     crawl_date=crawl_date,
                                                                                     press_list=press_list,
                                                                                     title_list=title_list,
                                                                                     link_list=link_list,
                                                                                     date_list=date_list)
            driver.close()
    article_df = pd.DataFrame({"날짜": date_list, "언론사": press_list, "제목": title_list, "링크": link_list})
    print(f"extract article num : {len(article_df)}")
    if remove_duplicate:
        article_df = article_df.drop_duplicates(['링크'], keep='first')
        print(f"after remove duplicate -> {len(article_df)}")
    article_df.to_excel(save_path, index=False)
def crawl_news_data(keyword, year, month, start_day, end_day, save_path):
    for day in tqdm(range(start_day, end_day+1)):
        date_time_obj = datetime(year=year, month=month, day=day)
        target_date = date_time_obj.strftime("%Y%m%d")
        ds_de = date_time_obj.strftime("%Y.%m.%d")
        get_naver_news_info_from_selenium(keyword=keyword, save_path=f"{save_path}/{keyword}/{target_date}_{keyword}_.xlsx", target_date=target_date, ds_de=ds_de, remove_duplicate=False)
keywords = ['사회서비스']
save_path = "/content/naver_news_article"
for keyword in keywords:
    os.makedirs(f"{save_path}/{keyword}")

for keyword in keywords:
    print(f"start keyword - {keyword} crawling ...")
    crawl_news_data(keyword=keyword, year=2022, month=1, start_day=1, end_day=2, save_path=save_path)
Upvotes: 0
Views: 398
Reputation: 92
I usually run Selenium on Colab, and in my setup I don't use webdriver_manager:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)
This is the basic setup that makes it work for me.
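As a quick sanity check that the headless browser actually starts with this setup, you can load a page and print its title (example.com here is just an arbitrary test URL, not something from your code):

# minimal smoke test: if Chrome started correctly, this prints "Example Domain"
driver.get("https://example.com")
print(driver.title)
driver.quit()  # shut the browser down cleanly when done

And if you do need to point Selenium at an explicit chromedriver binary, as your code does, note that Selenium 4 no longer accepts the driver path as a positional argument. A minimal sketch, assuming the chromium-chromedriver path from your setup commands:

from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the driver path via a Service object
service = Service('/usr/lib/chromium-browser/chromedriver')
driver = webdriver.Chrome(service=service, options=options)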
Upvotes: 0