Djibril Diakhate
Djibril Diakhate

Reputation: 1

Date Conversion Error When Using Tweepy to Retrieve Recent Tweets

I have been facing this problem since this morning. I've tried several things, but the result remains the same: I always get the following error:

time data '2024-05-03T17:13:39.000Z' does not match format '%Y-%m-%dT%H:%M:%S.%f%z'
---------------------------------------------------------------------------ValueError                                Traceback (most recent call last)<ipython-input-11-988e88e9cba4> in <module>
     48 at=dt2.strftime("%m/%d/%Y, %H:%M:%S %Z")
     49 
---> 50 paginator = client.search_recent_tweets(query=query, tweet_fields=["lang", "context_annotations", "public_metrics", "created_at", "id", "text", "author_id", "entities", "geo"], max_results=maxtweets_persearch)
     51 
     52 for page in paginator.flatten():
~/cluster-env/env/lib/python3.6/site-packages/tweepy/client.py in search_recent_tweets(self, query, user_auth, **params)
    655                 "since_id", "start_time", "tweet.fields", "until_id",
    656                 "user.fields"
--> 657             ), data_type=Tweet, user_auth=user_auth
    658         )
    659 
~/cluster-env/env/lib/python3.6/site-packages/tweepy/client.py in _make_request(self, method, route, params, endpoint_parameters, json, data_type, user_auth)
    174         if data_type is not None:
    175             if isinstance(data, list):
--> 176                 data = [data_type(result) for result in data]
    177             elif data is not None:
    178                 data = data_type(data)
~/cluster-env/env/lib/python3.6/site-packages/tweepy/client.py in <listcomp>(.0)
    174         if data_type is not None:
    175             if isinstance(data, list):
--> 176                 data = [data_type(result) for result in data]
    177             elif data is not None:
    178                 data = data_type(data)
~/cluster-env/env/lib/python3.6/site-packages/tweepy/tweet.py in __init__(self, data)
     39         if self.created_at is not None:
     40             self.created_at = datetime.datetime.strptime(
---> 41                 self.created_at, "%Y-%m-%dT%H:%M:%S.%f%z"
     42             )
     43 
~/cluster-env/env/lib/python3.6/_strptime.py in _strptime_datetime(cls, data_string, format)
    563     """Return a class cls instance based on the input string and the
    564     format string."""
--> 565     tt, fraction = _strptime(data_string, format)
    566     tzname, gmtoff = tt[-2:]
    567     args = tt[:6] + (fraction,)
~/cluster-env/env/lib/python3.6/_strptime.py in _strptime(data_string, format)
    360     if not found:
    361         raise ValueError("time data %r does not match format %r" %
--> 362                          (data_string, format))
    363     if len(data_string) != found.end():
    364         raise ValueError("unconverted data remains: %s" %
ValueError: time data '2024-05-03T17:13:39.000Z' does not match format '%Y-%m-%dT%H:%M:%S.%f%z'

The code generating this error is here:

def convert_date(date_str):
    """Convert a timestamp value into a ``datetime``.

    The explicit ISO-8601-style formats below are tried first with
    ``datetime.strptime``; anything else is handed to
    ``dateutil.parser.parse(..., fuzzy=True)`` as a last resort.

    Args:
        date_str: A timestamp string such as ``"2024-05-03T17:13:39.000Z"``,
            or a value that is already a ``datetime``.

    Returns:
        * the input itself when it is already a ``datetime``;
        * a timezone-aware UTC ``datetime`` when the string ends in a
          literal ``Z``;
        * the ``datetime`` as parsed for the other formats;
        * ``None`` (after printing a message) when nothing matches.
    """
    # Local import: the function must not depend on whether the surrounding
    # script imported the ``datetime`` module or the ``datetime`` class.
    from datetime import datetime as _datetime, timezone as _timezone

    # Already parsed upstream: nothing to convert.
    if isinstance(date_str, _datetime):
        return date_str

    if date_str is None:
        print(f"Format de date inattendu : {date_str}")
        return None

    # (format, treat result as UTC). Formats with milliseconds come first.
    # The trailing "Z" is matched as literal text instead of through %z so
    # the pattern does not rely on %z understanding "Z"; the UTC zone is
    # attached explicitly afterwards.
    formats = [
        ("%Y-%m-%dT%H:%M:%S.%fZ", True),   # milliseconds, literal "Z" (UTC)
        ("%Y-%m-%dT%H:%M:%S.%f", False),   # milliseconds, no time zone
        ("%Y-%m-%dT%H:%M:%S%z", False),    # no milliseconds, numeric offset
    ]
    for fmt, is_utc in formats:
        try:
            parsed = _datetime.strptime(date_str, fmt)
        except (ValueError, TypeError):
            continue  # try the next format
        if is_utc:
            parsed = parsed.replace(tzinfo=_timezone.utc)
        return parsed

    # No explicit format matched: fall back to the permissive parser that
    # this function originally relied on.
    try:
        return dateutil.parser.parse(date_str, fuzzy=True)
    except (ValueError, TypeError):
        print(f"Format de date inattendu : {date_str}")
        return None

# --- Search setup ----------------------------------------------------------
# Cut-off computed from the current UTC time minus `max_days` days.
end_date = datetime.utcnow() - timedelta(days=max_days)

# Accumulators filled by the extraction loop, one list per record type.
all_tweets, all_users = [], []
all_hashtags, all_urls = [], []
all_media, all_handles = [], []
count = 0
query_search = True

# Insertion timestamp, kept both as epoch seconds (`ts`) and as a formatted
# string (`at`).
dt2 = datetime.now()
ts = int(time.mktime(dt2.timetuple()))
at = dt2.strftime("%m/%d/%Y, %H:%M:%S %Z")

# NOTE(review): the traceback quoted with this snippet shows the ValueError
# being raised inside this call (tweepy/tweet.py parsing `created_at` with
# "%Y-%m-%dT%H:%M:%S.%f%z" under python3.6), i.e. before any local date
# conversion runs. Python's strptime accepts "Z" for %z only from 3.7 on --
# confirm the interpreter / tweepy versions in use.
paginator = client.search_recent_tweets(
    query=query,
    tweet_fields=[
        "lang",
        "context_annotations",
        "public_metrics",
        "created_at",
        "id",
        "text",
        "author_id",
        "entities",
        "geo",
    ],
    max_results=maxtweets_persearch,
)

# --- Extraction loop -------------------------------------------------------
# Walks the search results and appends one dict per tweet, user, hashtag,
# mention, url and media item to the corresponding all_* list.
#
# Names read from outside this block: paginator, last_created_at, parse,
# convert_date, topic, subtopic, query, regions, get_maps_response,
# remove_emojis, num_tweets, at, ts, count and the all_* lists.
#
# NOTE(review): `paginator` is the return value of
# client.search_recent_tweets(...); confirm that it really exposes
# .flatten() and that each yielded item has .data (in tweepy those look like
# they belong to tweepy.Paginator) -- TODO confirm.
# NOTE(review): several keys read below ("author", "user", "id_str",
# "full_text", "retweet_count", "place", ...) are not among the
# tweet_fields requested by the search call -- verify they exist on the
# objects returned by the API version in use.
for page in paginator.flatten():
    for status in page.data:

        tweet = {}
        user = {}
        tweet_obj = status

        # Skip tweets that are not newer than the last one already processed.
        if (last_created_at is not None and last_created_at != ''
                and parse(tweet_obj["created_at"]).timestamp() <= last_created_at.timestamp()):
            continue

        # ---- user record ----
        user_obj = tweet_obj["author"]
        user["topickey"] = topic
        user["id"] = user_obj["id"]
        user["document_type"] = "user"
        user['inserted_at'] = at
        user['inserted_ts'] = ts
        user['name'] = user_obj["name"]
        user['screen_name'] = user_obj["username"]
        user['description'] = user_obj["description"]
        user['protected'] = user_obj["protected"]
        user['followers_count'] = user_obj["public_metrics"]["followers_count"]
        user['friends_count'] = user_obj["public_metrics"]["following_count"]
        user['listed_count'] = user_obj["public_metrics"]["listed_count"]
        user['profile_image_url'] = user_obj["profile_image_url"]
        user['verified'] = user_obj["verified"]
        # NOTE(review): convert_date() can return None, in which case the
        # .month / .year access on the next line raises AttributeError.
        user['created_at'] = convert_date(user_obj["created_at"])
        user["month_year"] = str(str(user['created_at'].month) + "_"+str(user['created_at'].year))

        user["country_azuremaps"] = ''
        user["country_code_azuremaps"] = ''

        # Resolve the free-text user location to a country through
        # get_maps_response(); the fields stay empty when nothing is found.
        # NOTE(review): the location is read from tweet_obj["user"] while
        # every other user field comes from tweet_obj["author"] -- confirm.
        user_location = tweet_obj["user"]["location"]
        if user_location != "" and user_location not in regions:
            r_json = get_maps_response(user_location)
            if r_json: # i.e. got a response
                if r_json["summary"]["numResults"] > 0:
                    # there is a location detected, so get the country
                    if "address" in r_json['results'][0].keys():
                        top_match = r_json['results'][0]["address"]
                        if "country" in top_match.keys() and "countryCode" in top_match.keys() :
                            country = top_match["country"]
                            country_code = top_match["countryCode"]
                            user["country_azuremaps"] = country
                            user["country_code_azuremaps"] = country_code

        id_str = tweet_obj["id_str"]

        # ---- entity records ----
        # Every entity lookup sits under the same None guard: previously
        # only the hashtag branch was protected, and the mention / url /
        # media lookups would fail on a tweet whose entities are None.
        if tweet_obj['entities'] is not None:
            for key, value in tweet_obj['entities'].items():
                if key == 'hashtags':
                    for h in value:
                        hashtag = {}
                        hashtag['id'] = id_str
                        hashtag['text'] = h['text']
                        hashtag['created_datetime'] = datetime.now()
                        all_hashtags.append(hashtag)

            for um in tweet_obj['entities']['user_mentions']:
                user_mention = {}
                user_mention['id'] = id_str
                user_mention['screen_name'] = um['screen_name']
                user_mention['created_datetime'] = datetime.now()
                # Store the record built above (the raw `um` entity used to
                # be appended, silently discarding `user_mention`).
                all_handles.append(user_mention)
            for u in tweet_obj['entities']['urls']:
                urls = {}
                urls['id'] = id_str
                urls['url'] = u['url']
                urls['expanded_url'] = u['expanded_url']
                urls['display_url'] = u['display_url']
                urls['created_datetime'] = datetime.now()
                all_urls.append(urls)
            if 'media' in tweet_obj['entities']:
                for m in tweet_obj['entities']['media']:
                    media = {}
                    media['id'] = id_str
                    media['media_url'] = m['media_url']
                    media['created_datetime'] = datetime.now()
                    all_media.append(media)

        # ---- tweet record ----
        tweet["userid"]=user["id"]
        # Refresh the insertion timestamp for this tweet.
        dt2 = datetime.now()
        ts = int(time.mktime(dt2.timetuple()))
        at = dt2.strftime("%m/%d/%Y, %H:%M:%S %Z")
        tweet['inserted_at'] = at
        tweet['inserted_ts'] = ts
        tweet["originalid"] = tweet_obj["id"]
        tweet["id"] = str(int(tweet_obj["id_str"])+abs(hash(topic))) # artifically creating our own ID
        tweet["topickey"] = topic
        tweet["subtopic"] = subtopic
        # NOTE(review): same None caveat as for user['created_at'] above.
        tweet["created_at"] = convert_date(tweet_obj["created_at"])
        tweet["created_date"] = tweet["created_at"].date()
        tweet["month_year"] = str(str(tweet["created_at"].month) + "_"+str(tweet["created_at"].year))

        # Collapse line breaks into sentence separators, tidy the doubled
        # punctuation that produces, then strip leading dots and spaces.
        tmp_text = tweet_obj["full_text"].replace('\n','. ').replace('\r','.').replace('..','. ').replace(',.','. ').replace(';.','. ').replace('?.','. ').replace('!.','. ').replace(':.','. ').lstrip('.').lstrip(' ')
        tmp_text = remove_emojis(tmp_text)
        tweet["text"]= tmp_text
        tweet["document_type"] = "tweet"

        tweet["search_type"]='Topic Search'
        tweet["query"] = str(query)

        tweet["is_quote_status"] = tweet_obj["is_quote_status"]
        tweet["retweet_count"] = tweet_obj["retweet_count"]
        tweet["favorite_count"] = tweet_obj["favorite_count"]
        tweet["favorited"] = tweet_obj["favorited"]
        tweet["retweeted"] = tweet_obj["retweeted"]
        tweet["lang"] = tweet_obj["lang"]
        tweet["source"] = tweet_obj["source"]

        # Place information, 'NA' when the tweet carries none.
        if tweet_obj['place'] is None:
            city = 'NA'
            country = 'NA'
        else:
            city = tweet_obj["place"]['name']
            country = tweet_obj["place"]['country']

        tweet['city'] = city
        tweet['country'] = country

        all_tweets.append(tweet)
        all_users.append(user)
        count +=1
        if count > num_tweets:
            break
    else:
        # Inner loop ended without reaching the limit: go on to the next page.
        continue
    # The limit was reached inside the inner loop, so stop paging as well
    # (a bare `break` there only left the inner loop and paging continued).
    break

I've tried several date conversion methods, but I always get the same result, as if the code weren't taking my updates into account. Can anyone help me solve this problem, please?

Upvotes: 0

Views: 32

Answers (0)

Related Questions