shan
shan

Reputation: 583

Scraping various features in twitter data

I am trying to extract twitter data but facing an error. I am extracting thge following features using tweepy

'retweeted_status','hashtags', 'text', 'urls', 'user_mentions', 'screen_name', 'id', 'created_at', 'country' , 'state', 'place', 'hashtag_count', 'url_count', 'mention_count','possibly_sensitive','favorite_count', 'favorited', 'retweet_count', 'retweeted', user.statuses_count, user.favourites_count, user.followers_count, user_description',  user_'location', user_'time_zone'

It will be helpful if I can get help debugging the error of the following or alternatives in python to extract the above features

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import csv


from collections import Counter
import ast

import tweepy
import json
from tweepy import OAuthHandler

consumer_key =    'xxxxxxxxx'       
consumer_secret = 'xxxxxxxxx'       
access_key=       'xxxxxxxxx'         
access_secret =   'xxxxxxxxx' 

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

api = tweepy.API(auth)

from tweepy import Stream
#from tweepy.streaming import StreamListener



# get retweet status
   def try_retweet(status, attribute):
    try:
        if getattr(status, attribute):
            return True
    except AttributeError:
        return None

   # get country status
   def try_country(status, attribute):
    if getattr(status, attribute) != None:
        place = getattr(status, attribute)
        return place.country
    return None

# get city status
def try_city(status, attribute):
    if getattr(status, attribute) != None:
        place = getattr(status, attribute)
        return place.full_name
    return None

# function that tries to get attribute from object
def try_get(status, attribute):
    try:
        return getattr(status, attribute).encode('utf-8')
    except AttributeError:
        return None

# open csv file
csvFile = open('originalsample.csv', 'a')

# create csv writer
csvWriter = csv.writer(csvFile)

class MyListener(Stream):

    def on_status(self, status):
        try:
            # if this represents a retweet
            if try_retweet(status,'retweeted_status'):
                status = status.retweeted_status

                # get and sanitize hashtags 
                hashtags = status.entities['hashtags']
                hashtag_list = []
                for el in hashtags:
                    hashtag_list.append(el['text'])
                hashtag_count = len(hashtag_list)

                # get and sanitize urls
                urls = status.entities['urls']
                url_list = []
                for el in urls:
                    url_list.append(el['url'])
                url_count = len(url_list)

                # get and sanitize user_mentions
                user_mentions = status.entities['user_mentions']
                mention_list = []
                for el in user_mentions:
                    mention_list.append(el['screen_name'])
                mention_count = len(mention_list)

                # save it all as a tweet
                tweet = [status.id, status.created_at, try_country(status, 'place'), try_city(status, 'place'), status.text.encode('utf-8'), status.lang,
                  hashtag_list, url_list, mention_list, 
                  hashtag_count, url_count, mention_count, 
                  try_get(status, 'possibly_sensitive'),
                  status.favorite_count, status.favorited, status.retweet_count, status.retweeted, 
                  status.user.statuses_count, 
                  status.user.favourites_count, 
                  status.user.followers_count,
                  try_get(status.user, 'description'),
                  try_get(status.user, 'location'),
                  try_get(status.user, 'time_zone')]

                # write to csv
                csvWriter.writerow(tweet)
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    # tell us if there's an error
    def on_request_error(self, status):
        print(status)
        return True

twitter_stream = Stream(auth, MyListener())
twitter_stream.sample()

The output is suppose to be in the following format:

                      id    created_at       country    city    text                                   lang   hashtags  urls user_mentions  hashtag_count   url_count   mention_count   possibly_sensitive  favorite_count  favorited   retweet_count   retweeted   user_statuses_count user_favorites_count    user_follower_count user_description    user_location   user_timezone   
     0  669227044996124673  2015-11-24 18:52:15  NaN     NaN     Yo 💁🏼💟👌🏼 '               '    und []  []  []                0          0             0                NaN                      270      False               288   False                   10726                 18927          24429                      NaN                 Yucatán, México Mexico City

Its showing following error:

    ---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-c016fb9faa9c> in <module>
     92         return True
     93 
---> 94 twitter_stream = Stream(auth, MyListener())
     95 twitter_stream.sample()

TypeError: __init__() missing 4 required positional arguments: 'consumer_key', 'consumer_secret', 'access_token', and 'access_token_secret'

Upvotes: 0

Views: 119

Answers (1)

D Malan
D Malan

Reputation: 11464

StreamListener was merged into Stream in Tweepy v4.0.0 (see the docs for "Where did StreamListener go?).

You now need to subclass Stream and on_error changed to on_request_error.

Upvotes: 1

Related Questions