Reputation: 89
I have a script that collects Reddit comments. It pulls from a CSV file containing a list of links. Some of the links are dead, so I get 404/403/etc. errors. The code below correctly identifies those links and skips them, but it then exits the loop and finishes writing the CSV file instead of continuing on to the next link.
import praw
import pprint
import csv
import os
import pandas as pd
from collections import namedtuple
from datetime import datetime
from pathlib import Path
def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    try:
        for pid in df.id:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
    except:
        print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])  # append tuple
    df.to_csv(dest, index=False, encoding='utf-8')


if __name__ == "__main__":
    reddit_api = praw.Reddit(
        client_id="####",
        client_secret="####",
        user_agent="####",
        username="####",
        password="####"
    )
    # reddit_api = init_praw(client_id, client_secret, user_agent, username, password)
    csv_file = "####"
    dest_dir = "####"
    dest_name = "reddits_comments.csv"
    Path(dest_dir).mkdir(parents=True, exist_ok=True)
    dest = os.path.join(dest_dir, dest_name)
    scrape_comments(reddit_api, csv_file, dest)
Upvotes: 0
Views: 55
Reputation: 23079
You should put the try/except around a smaller portion of your code, as said in the comments. Here's an illustration of that:
def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    for pid in df.id:
        try:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
        except Exception:
            print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])
    df.to_csv(dest, index=False, encoding='utf-8')
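If you only want to skip the 404/403-style failures rather than swallow every error, you could catch the specific prawcore exceptions PRAW raises for those responses. Here is a minimal sketch of the same loop with narrower exception handling, assuming prawcore is available (it is installed as a dependency of praw) and keeping the rest of your script unchanged:

import prawcore

def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    for pid in df.id:
        try:
            submission = reddit_api.submission(id=pid)
            # fetching comments triggers the HTTP request, so 404/403 surfaces inside this try block
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
        except prawcore.exceptions.NotFound:
            # 404: deleted submission or bad id, skip it and continue with the next one
            print(f"Skipping {pid}: not found (404)")
        except prawcore.exceptions.Forbidden:
            # 403: private or quarantined subreddit, skip it and continue with the next one
            print(f"Skipping {pid}: forbidden (403)")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])
    df.to_csv(dest, index=False, encoding='utf-8')

Catching only the exception types you expect keeps genuine bugs (for example, a misspelled column name) from being silently skipped along with the dead links.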
Upvotes: 2