Reputation: 2234
I have a list of lists
[["Due to the storms this weekend, we have rescheduled the Blumenfield Bike Ride for Feb 26. Hope to see you there.\xe2\x80\xa6 '"], ['Lots of sun this weekend, take advantage of the Beach Bus that gets you from Woodland Hills to the beach for just $\xe2\x80\xa6 '], ["RT @LHansenLA: Yesterday got a peek inside @LAPPL @EagleandBadge new rig for End of Watch Memorial Wall. Moving tribute to fallen @LAPD w/\xe2\x80\xa6'"], ["Happy to join Art Sherman and Wings Over @Wendys to honor veterans & 15 years of weekly meetings hosted by Ron and\xe2\x80\xa6 '"], ["Join me for the 4th Annual Blumenfield Bike Ride. Enjoy the West Valley on 2 wheels. RSVP:'"]]
As you can see, the lists unfortunately are displaying literal UTF-8 instead of the characters themselves. At some point in my code, I encode into UTF-8
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
The above code is all necessary in order to remove the b prefixes. These cannot be in my tweets because I am doing machine learning analysis and having the bs affects it.
How do I replace the UTF-8 script with the actual characters?
I need to encode it somehow because I am pulling tweets from (3 cities) x (50 officials) x (12 months of tweets for each) so it would be impossibly inefficient to try to manually replace them.
import tweepy #https://github.com/tweepy/tweepy
#Twitter API credentials
consumer_key = "insert key here"
consumer_secret = "insert key here"
access_key = "insert key here"
access_secret = "insert key here"
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import json
import csv
import datetime
from datetime import datetime
import os.path
failed_accounts = []
def get_all_tweets(screen_name,mode):
#try:
#Twitter only allows access to a users most recent 3240 tweets with this method
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
i = 0
num_req = 0
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print ("...%s tweets downloaded so far" % (len(alltweets)))
num_req = num_req + 1
# makes further requests only if batch doesn't contain tweets beyond oldest limit
oldest_limit = datetime(2016, 1, 20,0,0,0)
x = 0
for tweet in new_tweets:
raw_date = tweet.created_at
if raw_date < oldest_limit:
x = 1
else:
continue
if x == 1:
break
#BSP this script is designed to just keep going. I want it to stop.
#i = i + 1
#if i == 10:
# break
print("Number of Tweet Request Rounds: %s" %num_req)
correct_date_tweet = []
for tweet in alltweets:
raw_date = tweet.created_at
#date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
newest_limit = datetime(2017, 1, 20,0,0,0)
oldest_limit = datetime(2016, 1, 20,0,0,0)
if raw_date > oldest_limit and raw_date < newest_limit:
correct_date_tweet.append(tweet)
else:
continue
#transform the tweepy tweets into a 2D array that will populate the csv
if mode == "tweets only" or "instance file":
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
outtweets = [["1 ",stuff.replace('"', "")] for sublist in outtweets for stuff in sublist]
#outtweets = [["1 ",stuff] for sublist in outtweets for stuff in sublist]
else:
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"),tweet.retweet_count,tweet.favorite_count,len(tweet.entities.get("hashtags")),len(tweet.entities.get("urls")),len(tweet.entities.get("user_mentions"))] for tweet in correct_date_tweet]
#write the csv
if mode == "instance file":
with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
writer = csv.writer(f)
writer.writerows(outtweets)
else:
with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
writer = csv.writer(f)
if mode != "tweets only":
writer.writerow(["id","created_at","text","retweets","favorites","hashtags","urls"])
writer.writerows(outtweets)
pass
print("Done with %s" % screen_name)
get_all_tweets("BobBlumenfield","instance file")
Based on an answer, I tried changing one of the lines to outtweets = [[tweet.text] for tweet in correct_date_tweet]
But this didn't work because it yields
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-12-a864b5efe8af> in <module>()
----> 1 get_all_tweets("BobBlumenfield","instance file")
<ipython-input-9-d0b9b37c7261> in get_all_tweets(screen_name, mode)
104 with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
105 writer = csv.writer(f)
--> 106 writer.writerows(outtweets)
107 else:
108 with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
C:\Users\Stan Shunpike\Anaconda3\lib\encodings\cp1252.py in encode(self, input, final)
17 class IncrementalEncoder(codecs.IncrementalEncoder):
18 def encode(self, input, final=False):
---> 19 return codecs.charmap_encode(input,self.errors,encoding_table)[0]
20
21 class IncrementalDecoder(codecs.IncrementalDecoder):
UnicodeEncodeError: 'charmap' codec can't encode characters in position 64-65: character maps to <undefined>
Upvotes: 1
Views: 2644
Reputation: 27704
Remove the following line:
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
Here's why:
b
.str
without an encoding defined. In this mode you're getting a representation of the array, including types, again hence the b
and the UTF-8 escaping..encode()
yourself if you use open()
's built-in encoder. When using open()
in text mode as you are doing, always specify the encoding as it's different per platform.
Remove all other uses of .encode()
from your code.
You can now remove the other lines that are trying to correct your error.
Upvotes: 2