Reputation: 43
I am trying web scraping though cloud function->PubSub--> BigQuery
I wrote a python code and I deployed my code to cloud functions. Text results of this code turn "ok" and I can see crawled data on logs. But when ı tried to pull messages from topic I could not get any data. When ı check PubSub Api metric I see 404 reponse. How shoul I write the code that publish messages to PubSub topic?
Here is the code I wrote so far:
import base64
from bs4 import BeautifulSoup
import requests
from google.cloud import pubsub_v1
def hello_pubsub(event, context):
publisher = pubsub_v1.PublisherClient()
# The `topic_path` method creates a fully qualified identifier
# in the form `projects/{project_id}/topics/{topic_id}`
topic_path = publisher.topic_path("tokyo-ring-<secret>", "webscraping")
html_text = requests.get('https://www.arabam.com/ikinci-el?take=50').text
#print(html_text)
soup = BeautifulSoup(html_text,'lxml')
models = soup.find_all('tr', class_='listing-list-item pr should-hover bg-white')
for model in models:
model_name = model.find('td', class_='listing-modelname pr').text
title = model.find('td', class_='horizontal-half-padder-minus pr').text
model_year = model.find('td', class_='listing-text pl8 pr8 tac pr').text
price = model.find('td', 'pl8 pr8 tac pr').text.replace('TL','').replace(' ','').replace('.','')
publish_date = model.find('td', class_='listing-text tac pr').text
location = model.find('div', style='display:flex;justify-content:center;align-items:center;height:81px').text.split(' ', 1)[0]
data= "{"+"\"model_name\":\""+model_name+"\""+","+"\"title\":"+"\""+title+"\",\""+"model_year\""+":\""+model_year+"\""+",\"price\":\""+price+"\""+",\"publish_date\":\""+publish_date+"\","+"\"location\":\""+location+"\"}"
#pubsub_message = base64.b64decode(event['data']).decode('utf-8')
print(data)
Upvotes: 2
Views: 174
Reputation: 2211
I do not see the publisher.publish() function in your code snippet and that's how you would publish messages to PubSub. Here is a full example showing how you can publish to pubsub:
"""Publishes multiple messages to a Pub/Sub topic with an error handler."""
from concurrent import futures
from google.cloud import pubsub_v1
# TODO(developer)
# project_id = "your-project-id"
# topic_id = "your-topic-id"
publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path(project_id, topic_id)
publish_futures = []
def get_callback(publish_future, data):
def callback(publish_future):
try:
# Wait 60 seconds for the publish call to succeed.
print(publish_future.result(timeout=60))
except futures.TimeoutError:
print(f"Publishing {data} timed out.")
return callback
for i in range(10):
data = str(i)
# When you publish a message, the client returns a future.
publish_future = publisher.publish(topic_path, data.encode("utf-8"))
# Non-blocking. Publish failures are handled in the callback function.
publish_future.add_done_callback(get_callback(publish_future, data))
publish_futures.append(publish_future)
# Wait for all the publish futures to resolve before exiting.
futures.wait(publish_futures, return_when=futures.ALL_COMPLETED)
print(f"Published messages with error handler to {topic_path}.")
Reference: https://cloud.google.com/pubsub/docs/publisher
Upvotes: 1