meowlicious
meowlicious

Reputation: 21

How to scrape a notion page contained in a table cell of a Notion Site with selenium?

I want to make a dataset from the contents of notion pages that exist in a Notion Table in a Notion Site.

I have managed to write a script that does exactly what I want to do which is:

  1. Open the site
  2. Go to each row of the table
  3. Hover on the right side to make the Open button visible
  4. Click the Open button to open the notion page
  5. Get the content of the notion page
  6. Move to the next cell below and repeat from step 3

My problem is that when the page is loaded, only 26 rows are visible (28 if I make the window bigger) out of the 47 rows of the table. I have tried scrolling to the bottom, but my script still cannot detect more than 28 rows. My function below works for locating the rows that are not visible after the 16th:

def extract_content_from_cell(driver: webdriver.Chrome, row_number: int) -> str:
    """
    Open the Notion page behind one table row and return its visible text.

    The row is addressed by an absolute XPath built from ``row_number``.
    After the cell is found it is hovered, the 'Open in side peek' button is
    clicked, and the text of the element with class ``notion-page-content``
    is read.

    Returns the page text, or an empty string when the cell, the button or
    the page content could not be obtained (the error is printed).
    """
    print(f"Processing cell {row_number}...")

    cell_xpath = f"//*[@id='notion-app']/div/div[1]/div/div[1]/main/div/div/div[3]/div[2]/div/div/div/div[3]/div[2]/div[{row_number}]/div/div[1]/div/div[2]/div/div"
    print(f"Locating cell {row_number}...")

    # Step 1: wait (up to 15 s) for the cell to be present in the DOM.
    try:
        cell_wait = WebDriverWait(driver, 15)
        target_cell = cell_wait.until(
            EC.presence_of_element_located((By.XPATH, cell_xpath)))
        print(f"Cell {row_number} located successfully.")
    except Exception as locate_error:
        print(f"Error locating cell {row_number}: {locate_error}")
        return ""

    # Step 2: rows past the 16th get a scroll of the Notion container first.
    # The scroll is retried until one call succeeds, at most 10 times, 40 px
    # per call. (scroll_notion_container is defined elsewhere; presumably it
    # scrolls and hovers - confirm against its definition.)
    scroll_pending = row_number > 16
    attempts_made = 0
    while scroll_pending and attempts_made < 10:
        attempts_made += 1
        try:
            scroll_notion_container(driver, target_cell, 40)
            print(f"Hovered over cell {row_number}.")
            scroll_pending = False
        except Exception as scroll_error:
            print(
                f"Scrolling down to bring cell {row_number} into view: {scroll_error}")

    # Step 3: hover (again, if we scrolled) so the row's button is rendered.
    hover_over_element(driver, target_cell)

    # Step 4: find and click the button that opens the page.
    print(f"Locating the 'Open in side peek' button for cell {row_number}...")
    try:
        button_wait = WebDriverWait(driver, 15)
        peek_button = button_wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@aria-label='Open in side peek']")))
        print(
            f"Clicking the 'Open in side peek' button for cell {row_number}...")
        peek_button.click()
    except Exception as click_error:
        print(f"Button not visible for cell {row_number}, error: {click_error}")
        return ""

    # Fixed pause before reading the opened page.
    time.sleep(4)

    # Step 5: read the text of the opened page.
    print(f"Extracting text from the side page for cell {row_number}...")
    try:
        content_wait = WebDriverWait(driver, 15)
        page_body = content_wait.until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "notion-page-content")))
        extracted_text = page_body.text
        print(f"Extracted text content for cell {row_number}.")
        return extracted_text
    except Exception as read_error:
        print(f"Error extracting content from cell {row_number}: {read_error}")
        return ""

My problem is probably that the rows are not located in the first place.

def get_total_rows(driver: webdriver.Chrome,
                   table_xpath: str) -> int:
    """
    Count the elements that currently match ``table_xpath``.

    Only elements found by ``driver.find_elements`` at the moment of the call
    are counted, and the count is printed before it is returned.
    """
    print("Determining the total number of rows in the table...")
    total_rows = len(driver.find_elements(By.XPATH, table_xpath))
    print(f"Total rows in the table: {total_rows}")
    return total_rows

Does anyone have an idea how I might solve this issue? I could process the 47 entries manually, but I want to do the same for a table with 400 rows.

Upvotes: 1

Views: 272

Answers (1)

x1337Loser
x1337Loser

Reputation: 635

Here is a solution using the Python requests module (as you asked me to post the answer here).

Calling the API endpoints of the target site directly solves the problem. Here are the details:

API Endpoint:

  • https://web3sec.notion.site/api/v3/syncRecordValuesSpace fetches the space_id and collection_id from the server.
  • https://web3sec.notion.site/api/v3/queryCollection?src=initial_load fetches the block_id of all 47 rows.
  • https://web3sec.notion.site/api/v3/loadCachedPageChunk fetches the full chunk data for each block_id.

Code:

  • Module used: uuid, requests, urllib.parse.quote

I have included all the details as comments in the code:

#!/usr/bin/env python3

import uuid
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from urllib.parse import quote
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) #disable urllib HTTP warning

# All three endpoints live under the same API root of the target Notion site.
_API_ROOT = 'https://web3sec.notion.site/api/v3'

url = _API_ROOT + '/syncRecordValuesSpace'                  # page record lookup
query_url = _API_ROOT + '/queryCollection?src=initial_load'  # table row ids
chunk_url = _API_ROOT + '/loadCachedPageChunk'              # page contents

# Headers sent with every request (a browser-like User-Agent).
headers = dict([
    ('User-Agent',
     'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0'),
])

def getJsonValue(content, block, count, space_id):
    """Print the header and body of one table row's page.

    Args:
        content: decoded JSON reply of ``loadCachedPageChunk``; read as
            ``content['recordMap']['block'][<id>]['value']``.
        block: id of the row's page block inside ``content``.
        count: 1-based row number, used only in the printed header.
        space_id: workspace id, used only to build image URLs.

    Raises:
        KeyError / IndexError: if the row itself lacks the ``title`` or
            ``ubeE`` property (``ubeE`` is the property id this particular
            table uses for the PoC name/link column).

    Child blocks are handled best-effort: a child that is missing from the
    reply, or has no text or no image, is skipped silently. Text and image
    are looked up independently, so an image block without a title still
    gets its picture URL printed, and a page with no ``content`` list just
    prints the header.
    """
    records = content['recordMap']['block']
    page = records[block]['value']
    props = page['properties']

    poc_name = props['ubeE'][0][0]
    poc_link = props['ubeE'][0][1][0][1]
    title = props['title'][0][0]
    print(f"\n================================================\nDISCUSSION NO:  {count}\nDISCUSSION TITLE:   {title}\nTARGET DISCUSSION POC SCRIPT NAME:  {poc_name}\nTARGET DISCUSSION POC SCRIPT LINK:  {poc_link}\nDISCUSSION WITH DESCRIPTION:  \n-----")

    # An empty page has no 'content' key at all.
    for child_id in page.get('content') or []:
        child = (records.get(child_id) or {}).get('value') or {}

        # Each title fragment is a list whose first item is the plain text.
        for fragment in (child.get('properties') or {}).get('title') or []:
            if fragment:
                print(fragment[0])

        # The image source must be fully percent-encoded (safe='') because it
        # is embedded as a single path segment of the proxy URL.
        source_pic = (child.get('format') or {}).get('display_source')
        if source_pic is not None:
            print(f"------------\nSOURCE PICTURE: https://web3sec.notion.site/image/{quote(source_pic, safe='')}?table=block&id={child_id}&spaceId={space_id}&width=1620&userId=&cache=v2\n")


def _post_json(session, endpoint, payload, timeout):
    """POST ``payload`` as JSON to ``endpoint`` and return the decoded reply."""
    # NOTE(review): verify=False disables TLS certificate checking; it is kept
    # from the original script but should be dropped unless a proxy needs it.
    resp = session.post(endpoint, headers=headers, json=payload,
                        verify=False, timeout=timeout)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of on a later KeyError
    return resp.json()


def sendRequest(string_val, row_limit=50, timeout=30):
    """Fetch every row of the table page ``string_val`` and print its content.

    Args:
        string_val: 32-character hex page id taken from the site URL.
        row_limit: maximum number of rows requested from ``queryCollection``.
            Defaults to 50 (the original hard-coded value); raise it for
            larger tables, e.g. ``row_limit=400``.
        timeout: per-request timeout in seconds; without one ``requests``
            waits indefinitely.

    Raises:
        ValueError: if ``string_val`` is not a valid UUID hex string.
        requests.HTTPError: if any of the API calls returns an error status.
    """
    page_id = str(uuid.UUID(string_val))  # 32 hex chars -> dashed UUID form

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        # First request: resolve the view, collection and space ids of the page.
        lookup = {"requests": [{"pointer": {"table": "block", "id": page_id},
                                "version": -1}]}
        page_value = _post_json(session, url, lookup, timeout)[
            'recordMap']['block'][page_id]['value']
        view_id = page_value['view_ids'][0]
        collection_id = page_value['collection_id']
        space_id = page_value['space_id']

        # Second request: ask for the block ids of the table rows.
        query = {
            "source": {"type": "collection", "id": f"{collection_id}",
                       "spaceId": f"{space_id}"},
            "collectionView": {"id": f"{view_id}", "spaceId": f"{space_id}"},
            "loader": {
                "reducers": {"collection_group_results": {"type": "results",
                                                          "limit": row_limit}},
                "searchQuery": "",
                "userTimeZone": "Asia/Dhaka",
            },
        }
        row_ids = _post_json(session, query_url, query, timeout)[
            'result']['reducerResults']['collection_group_results']['blockIds']

        # Third part: load and print the page chunk of every row.
        for row_number, block_id in enumerate(row_ids, start=1):
            chunk_request = {"page": {"id": f"{block_id}", "spaceId": f"{space_id}"},
                             "limit": 50, "cursor": {"stack": []},
                             "verticalColumns": False}
            chunk = _post_json(session, chunk_url, chunk_request, timeout)
            getJsonValue(chunk, block_id, row_number, space_id)
    

# Script entry: the argument is the 32-character hex page id taken from the
# site URL, https://web3sec.notion.site/b201fe69f84e4050bf3915c6030f0fdf
sendRequest('b201fe69f84e4050bf3915c6030f0fdf')

Upvotes: 0

Related Questions