Reputation: 21
I want to make a dataset from the contents of notion pages that exist in a Notion Table in a Notion Site.
I have managed to write a script that does exactly what I want to do which is:
I hover over a row so that the "Open" button becomes visible, then click the "Open" button to open the Notion page. My problem is that when the page is loaded, only 26 rows are visible (or 28 if I make the window bigger), out of the 47 rows of the table. I have tried scrolling to the bottom, but my script still cannot detect more than 28 rows. My function here works for locating the elements that are not visible after the 16th:
def extract_content_from_cell(driver: webdriver.Chrome, row_number: int) -> str:
    """
    Extracts the content from a single table cell and returns the text content.

    Scrolls the Notion table container until the row is actually visible
    (rows past roughly the 16th are rendered off-screen), hovers the cell to
    reveal the 'Open in side peek' button, clicks it, and returns the text of
    the opened side page.  Returns "" on any failure.
    """
    print(f"Processing cell {row_number}...")
    cell_xpath = f"//*[@id='notion-app']/div/div[1]/div/div[1]/main/div/div/div[3]/div[2]/div/div/div/div[3]/div[2]/div[{row_number}]/div/div[1]/div/div[2]/div/div"
    print(f"Locating cell {row_number}...")
    try:
        cell_element = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, cell_xpath))
        )
        print(f"Cell {row_number} located successfully.")
    except Exception as e:
        print(f"Error locating cell {row_number}: {e}")
        return ""
    # If the row number is greater than 16, scroll the Notion container until
    # the cell is actually displayed.  BUGFIX: the previous version broke out
    # of this loop after the very first scroll call regardless of whether the
    # cell had come into view, so deep rows were often still off-screen.
    if row_number > 16:
        for _ in range(10):  # scroll the container up to 10 times, 40 px each
            if cell_element.is_displayed():
                break  # the cell is in view; no more scrolling needed
            try:
                scroll_notion_container(driver, cell_element, 40)
                print(f"Scrolled toward cell {row_number}.")
            except Exception as e:
                print(
                    f"Scrolling down to bring cell {row_number} into view: {e}")
    # hover over the cell (after any scrolling) so the action button appears
    hover_over_element(driver, cell_element)
    # locate and click the 'Open in side peek' button
    print(f"Locating the 'Open in side peek' button for cell {row_number}...")
    try:
        open_button = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@aria-label='Open in side peek']"))
        )
        print(
            f"Clicking the 'Open in side peek' button for cell {row_number}...")
        open_button.click()
    except Exception as e:
        print(f"Button not visible for cell {row_number}, error: {e}")
        return ""
    time.sleep(4)  # give the side peek time to render its content
    # extract the text content from the opened page
    print(f"Extracting text from the side page for cell {row_number}...")
    try:
        content_element = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "notion-page-content"))
        )
        page_text = content_element.text
        print(f"Extracted text content for cell {row_number}.")
        return page_text
    except Exception as e:
        print(f"Error extracting content from cell {row_number}: {e}")
        return ""
My problem is probably that the rows are not located in the first place.
def get_total_rows(driver: webdriver.Chrome,
                   table_xpath: str) -> int:
    """
    Count how many elements the given XPath currently matches in the DOM,
    i.e. the number of Notion table rows the page exposes.
    """
    print("Determining the total number of rows in the table...")
    row_count = len(driver.find_elements(By.XPATH, table_xpath))
    print(f"Total rows in the table: {row_count}")
    return row_count
Does anyone have any idea how I might solve this issue? I could handle the 47 entries manually, but I want to do the same for a table with 400 rows.
Upvotes: 1
Views: 272
Reputation: 635
Here is the solution using the Python requests
module (as you told me to post the answer here).
Using the API endpoints of your target app can solve the problem. Here are the details:

1. https://web3sec.notion.site/api/v3/syncRecordValuesSpace — fetched the space_id and collection_id from the server.
2. https://web3sec.notion.site/api/v3/queryCollection?src=initial_load — fetched the block_id of all 47 rows.
3. https://web3sec.notion.site/api/v3/loadCachedPageChunk — fetched the full chunk data of each block_id.

Modules used: uuid, requests, urllib.parse.quote.
I include all the details in the code as a comment form,
#!/usr/bin/env python3
import uuid
import requests
# NOTE(review): requests.packages.urllib3 is a deprecated compatibility alias;
# importing urllib3 directly is the modern form -- kept as-is here.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from urllib.parse import quote
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) #disable urllib HTTP warning

# Notion-internal (undocumented) v3 API endpoints for the target workspace.
url = 'https://web3sec.notion.site/api/v3/syncRecordValuesSpace'  # resolves a block to its space/collection ids
query_url = 'https://web3sec.notion.site/api/v3/queryCollection?src=initial_load'  # lists the block ids of all rows
chunk_url = 'https://web3sec.notion.site/api/v3/loadCachedPageChunk'  # full content of one row block
# Browser-like User-Agent so the requests are not rejected as a bot.
headers = {
    'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
}
def getJsonValue(content, block, count, space_id):
    """
    Print the title, POC script name/link and body text of one table row.

    Parameters
    ----------
    content : dict
        Decoded JSON response of a loadCachedPageChunk call.
    block : str
        UUID of the row block inside content['recordMap']['block'].
    count : int
        1-based row number, used only for display.
    space_id : str
        Notion space id, needed to build image URLs.
    """
    row = content['recordMap']['block'][block]['value']
    poc_name = row['properties']['ubeE'][0][0]
    poc_link = row['properties']['ubeE'][0][1][0][1]
    title = row['properties']['title'][0][0]
    print(f"\n================================================\nDISCUSSION NO: {count}\nDISCUSSION TITLE: {title}\nTARGET DISCUSSION POC SCRIPT NAME: {poc_name}\nTARGET DISCUSSION POC SCRIPT LINK: {poc_link}\nDISCUSSION WITH DESCRIPTION: \n-----")
    contents = row['content']
    for blocks in contents:
        try:
            child = content['recordMap']['block'][blocks]['value']
            for i in child['properties']['title']:
                print(i[0])
            # get source pic URL and urllib.parse.quote for encoding the URL value
            source_pic = child['format']['display_source']
            print(f"------------\nSOURCE PICTURE: https://web3sec.notion.site/image/{quote(source_pic, safe='')}?table=block&id={blocks}&spaceId={space_id}&width=1620&userId=&cache=v2\n")
        except (KeyError, IndexError, TypeError):
            # Best-effort: child blocks without text or without an image are
            # skipped.  BUGFIX: the previous bare `except Exception: pass`
            # also swallowed genuine programming errors (NameError etc.).
            pass
def sendRequest(string_val):
    """
    Scrape every row of the target Notion table via Notion's internal v3 API.

    Three-step flow:
      1. syncRecordValuesSpace -> view id, collection_id and space_id
      2. queryCollection       -> block ids of all rows (limit 50)
      3. loadCachedPageChunk   -> full content of each row, printed via getJsonValue

    string_val: the 32-char hex block id taken from the Notion page URL.
    """
    val_uuid = uuid.UUID(string_val) #convert 32 char hex to uuid
    data = {"requests":[{"pointer":{"table":"block","id":f"{val_uuid}"},"version":-1}]}
    session = requests.Session()
    #first request to get the collection_id and space_id
    # NOTE(review): verify=False disables TLS certificate verification -- only
    # acceptable for a throwaway scrape; remove it for anything long-lived.
    resp = session.post(url, headers=headers, json=data, verify=False).json()
    get_id = resp['recordMap']['block'][f'{val_uuid}']['value']['view_ids'][0]
    collection_id = resp['recordMap']['block'][f'{val_uuid}']['value']['collection_id']
    space_id = resp['recordMap']['block'][f'{val_uuid}']['value']['space_id']
    #second request to get block_id of all 47 rows
    # NOTE(review): limit is 50 here; for the 400-row table this presumably
    # needs raising or paginating -- confirm against the API's max page size.
    second_response = {"source":{"type":"collection","id":f"{collection_id}","spaceId":f"{space_id}"},"collectionView":{"id":f"{get_id}","spaceId":f"{space_id}"},"loader":{"reducers":{"collection_group_results":{"type":"results","limit":50}},"searchQuery":"","userTimeZone":"Asia/Dhaka"}}
    data = second_response
    resp = session.post(query_url, json=data, headers=headers, verify=False).json()
    get_block = resp['result']['reducerResults']['collection_group_results']['blockIds']
    #third part to get chunk data of all 47 rows
    for n, block_id in enumerate(get_block): # n+1 gives the 1-based row count for display
        data = {"page":{"id":f"{block_id}","spaceId":f"{space_id}"},"limit":50,"cursor":{"stack":[]},"verticalColumns":False}
        resp = session.post(chunk_url, json=data, headers=headers, verify=False).json()
        getJsonValue(resp, block_id, n+1, space_id)
sendRequest('b201fe69f84e4050bf3915c6030f0fdf') #this 32 char hex value from your URL: https://web3sec.notion.site/b201fe69f84e4050bf3915c6030f0fdf
Upvotes: 0