Python: Twitter API tweets/search: Flatten nested dictionary to columns

Python Code to Flatten Twitter Output into a Flat Columnar Format for .csv / Tableau / SQL

def test_for_entity(root,key,entity_value):
    """Reduce a list met while flattening a tweet to the values worth keeping.

    Lists that sit below an ``entities`` node are collapsed to one field per
    item (hashtag text, media URL, expanded URL, screen name). Any other list
    is returned as a shallow copy.

    Args:
        root: Dotted path of the value being flattened, e.g.
            ``"user.entities.url.urls"``.
        key: Last component of that path; selects which field is extracted.
        entity_value: The list to reduce. For the user-level ``url`` and
            ``description`` entities it is instead the dictionary holding a
            ``'urls'`` list.

    Returns:
        A new list with the extracted values.

    Raises:
        KeyError: If an entity item lacks the field expected for ``key``.
    """
    # Lists outside an 'entities' subtree carry no known structure.
    if 'entities' not in root.split("."):
        return list(entity_value)

    # Entities for tweets
    if key in ("symbols", "hashtags"):
        return [item['text'] for item in entity_value]
    if key == "media":
        return [item['media_url'] for item in entity_value]
    if key == "urls":
        return [item['expanded_url'] for item in entity_value]
    if key == "user_mentions":
        return [item['screen_name'] for item in entity_value]

    # Entities for users
    if key in ("url", "description"):
        return [item['expanded_url'] for item in entity_value['urls']]

    # Unknown entity: report it, but still hand back the raw items so the
    # caller always receives a list.
    print("[ERROR: unknown entity name'" + str(key) + "']")
    return list(entity_value)


# The name starts with "test_", so tell pytest-style collectors that this
# helper is not a test case.
test_for_entity.__test__ = False

def flatten(indict, current_key=None, outerdict=None):
    """Flatten a nested dictionary into a single level with dotted keys.

    Nested dictionaries are descended into recursively and their keys are
    joined with ``'.'``. Lists are passed through ``test_for_entity`` so that
    entity lists are reduced to their useful values. Every other value is
    stored unchanged.

    Args:
        indict: The (possibly nested) dictionary to flatten.
        current_key: Dotted prefix accumulated so far; ``None`` at the top.
        outerdict: Dictionary that receives the flattened items. A fresh one
            is created when omitted.

    Returns:
        ``outerdict``, filled with one entry per leaf value of ``indict``.
    """
    if outerdict is None:
        outerdict = {}
    # .items() is available on both Python 2 and Python 3 dictionaries.
    for key, value in indict.items():
        newkey = current_key + '.' + key if current_key else key
        if isinstance(value, dict):
            # Not a leaf: recurse, extending the key prefix.
            flatten(value, current_key=newkey, outerdict=outerdict)
        elif isinstance(value, list):
            outerdict[newkey] = test_for_entity(newkey, key, value)
        else:
            outerdict[newkey] = value
    return outerdict

def flattened_tweet_list(tweets):
    """Flatten tweets into a list of flat dictionaries for outputting to csv.

    Args:
        tweets: Iterable of tweet dictionaries.

    Returns:
        A list with one flattened dictionary per tweet, in input order.
    """
    return [flatten(tweet) for tweet in tweets]

In use, twitter_output is the dictionary form of the original JSON output. It can come directly from Twython, or from twitter_output = json.loads(twitter_json_object):

# 'statuses' holds the tweets as a list, whereas flatten() takes a single
# dictionary, so the list goes through flattened_tweet_list().
tweets = twitter_output['statuses']
flattend_tweet_data = flattened_tweet_list(tweets)

below is my original question...

Original Problem

I am flattening a nested dictionary of tweet descriptors into a columnar output with separators of "__" between each level of nested dictionary items and printing to the terminal as a visual test.

My code below does this successfully but I don't know enough about Python to make it more elegant.

Extract of the flattened dictionary output in the terminal (edited a little with spaces and u'' added to make it clearer):

user__entities__url__urls          [u'']
user__entities__description__urls  []
user__profile_location             u'None'
user__url                          u''
user__profile_image_url_https      u'https://image_url.jpeg'
user__profile_sidebar_fill_color   u'EFEFEF'
user__location                     u'Los Angeles, CA'

Tweet format

I obtain the following example search response using Twython in a dictionary format from the .search() method:

response = [{u'contributors': None, u'truncated': False, u'text': u'Hate It or love It? Kim Kardashian in Balmain &amp; Alexander McQueen [Photos] via @lovebscott', u'in_reply_to_status_id': None, u'id': 537357629975064577, u'favorite_count': 0, u'source': u'<a href="" rel="nofollow">Tweetbot for Mac</a>', u'retweeted': False, u'coordinates': None, u'entities': {u'symbols': [], u'user_mentions': [{u'id': 14521926, u'indices': [106, 117], u'id_str': u'14521926', u'screen_name': u'lovebscott', u'name': u'B. Scott'}], u'hashtags': [], u'urls': [{u'url': u'', u'indices': [79, 101], u'expanded_url': u'', u'display_url': u'\u2026'}], u'media': [{u'expanded_url': u'', u'display_url': u'', u'url': u'', u'media_url_https': u'', u'id_str': u'537357629656281090', u'sizes': {u'large': {u'h': 612, u'resize': u'fit', u'w': 610}, u'small': {u'h': 341, u'resize': u'fit', u'w': 340}, u'medium': {u'h': 601, u'resize': u'fit', u'w': 600}, u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}}, u'indices': [118, 140], u'type': u'photo', u'id': 537357629656281090, u'media_url': u''}]}, u'in_reply_to_screen_name': None, u'in_reply_to_user_id': None, u'retweet_count': 0, u'id_str': u'537357629975064577', u'favorited': False, u'user': {u'follow_request_sent': None, u'profile_use_background_image': False, u'profile_text_color': u'333333', u'default_profile_image': False, u'id': 14521926, u'profile_background_image_url_https': u'', u'verified': True, u'profile_location': None, u'profile_image_url_https': u'', u'profile_sidebar_fill_color': u'EFEFEF', u'entities': {u'url': {u'urls': [{u'url': u'', u'indices': [0, 22], u'expanded_url': u'', u'display_url': u''}]}, u'description': {u'urls': []}}, u'followers_count': 161968, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'14521926', u'profile_background_color': u'131516', u'listed_count': 1905, u'is_translation_enabled': False, u'utc_offset': -18000, u'statuses_count': 58304, u'description': u'#KingofFabulous - #TheMultimediaMaven - 
Mogul - TV / Internet Personality - @EBONYMag Advice Columnist - @glam_com Contributing Editor', u'friends_count': 373, u'location': u'Los Angeles, CA', u'profile_link_color': u'009999', u'profile_image_url': u'', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'', u'profile_background_image_url': u'', u'name': u'B. Scott', u'lang': u'en', u'profile_background_tile': True, u'favourites_count': 14, u'screen_name': u'lovebscott', u'notifications': None, u'url': u'', u'created_at': u'Fri Apr 25 03:29:42 +0000 2008', u'contributors_enabled': False, u'time_zone': u'Quito', u'protected': False, u'default_profile': False, u'is_translator': False}, u'geo': None, u'in_reply_to_user_id_str': None, u'possibly_sensitive': True, u'lang': u'en', u'created_at': u'Tue Nov 25 21:30:17 +0000 2014', u'in_reply_to_status_id_str': None, u'place': None, u'metadata': {u'iso_language_code': u'en', u'result_type': u'recent'}}]

The general dictionary format of a Twython search response is as follows:



  1. Cycle through keys in dictionary printing keys,values until values is a nested structure
  2. Determine what nested structure values is:
    1. if list: run test_for_entity()
    2. if dictionary, go to 1.
    3. otherwise, output the whole structure

It is the iteration of 2.1,2.2 and 2.3 that I think could be a lot cleaner but I don't know how to do it :(


Note: The function test_for_entity() handles the nested structure of entities as not all the information is relevant. It is used whenever the next nested structure is a list rather than a dictionary.

def test_for_entity(root,key,entity_value):
    """Print one output line for a list value met while walking a tweet.

    Lists that sit below an ``entities`` node are reduced to one field per
    item before printing. Any other list is printed as it is, followed by the
    name of its parent key.

    Args:
        root: ``"__"``-joined path of the parent of the value; may be empty
            for a top-level key.
        key: Key under which the list was found.
        entity_value: The list to print. For the user-level ``url`` and
            ``description`` entities it is instead the dictionary holding a
            ``'urls'`` list.

    Returns:
        None. The function only prints.
    """
    path_parts = root.split("__")
    parent_key = path_parts[-1]
    # An empty root must not leave a dangling "__" in front of the key.
    label = root + "__" + key if root else key

    # test if list is entity
    if 'entities' not in path_parts:
        print("%s %s %s" % (label, list(entity_value), parent_key))
        return

    # Entities for tweets
    if key in ("symbols", "hashtags"):
        list_items = [item['text'] for item in entity_value]
    elif key == "media":
        list_items = [[item['type'], item['media_url']] for item in entity_value]
    elif key == "urls":
        list_items = [item['expanded_url'] for item in entity_value]
    elif key == "user_mentions":
        list_items = [item['screen_name'] for item in entity_value]
    # Entities for users
    elif key in ("url", "description"):
        list_items = [item['expanded_url'] for item in entity_value['urls']]
    else:
        # Unknown entity: report it together with the raw items.
        print("[ERROR: unknown entity name'%s'] list %s %s"
              % (key, parent_key + "__" + key, list(entity_value)))
        return
    print("%s %s" % (label, list_items))


# The name starts with "test_", so tell pytest-style collectors that this
# helper is not a test case.
test_for_entity.__test__ = False

# Walk every tweet in `response` up to four dictionary levels deep, building
# the column name by joining the keys of each level with "__".
# As listed, only dict and list values are handled at each level; any other
# value falls through without output.
# NOTE(review): the sample output earlier in the post shows scalar values as
# well, so `else` branches may be missing from this listing -- confirm.
for tweet in response:
    for key_0,value_0 in tweet.items():
        if type(value_0) is dict: 
            for key_1,value_1 in value_0.items():
                if type(value_1) is dict: 
                    for key_2,value_2 in value_1.items():
                        if type(value_2) is dict: 
                            for key_3,value_3 in value_2.items():
                                if type(value_3) is dict:
                                    # Limit of recursive unpacking...
                                    # A dictionary at the fourth level is printed whole.
                                    print key_0+"__"+key_1+"__"+key_2+"__"+key_3,value_3
                                elif type(value_3) is list:
                                    # The list goes to test_for_entity(), then the raw list is printed too.
                                    test_for_entity(root = key_0+"__"+key_1+"__"+key_2,key = key_3,entity_value=value_3)
                                    print key_0+"__"+key_1+"__"+key_2+"__"+key_3,value_3
                        elif type(value_2) is list:
                            test_for_entity(root = key_0+"__"+key_1,key = key_2,entity_value=value_2)
                            print key_0+"__"+key_1+"__"+key_2,value_2
                elif type(value_1) is list:
                    test_for_entity(root=key_0,key = key_1,entity_value=value_1)
                    print key_0+"__"+key_1,value_1
        elif type(value_0) is list:
            # Top-level list: there is no parent path, so root is the empty string.
            test_for_entity(root="",key = key_0,entity_value=value_0)
            print key_0,value_0

You didn't make it quite clear what you want to do when you encounter an array, so I'm leaving that alone:

Given a variable user as the subset of your Twitter response you selected:

user = {u'user': {u'lang': u'en', u'utc_offset': -18000, u'statuses_count': 58304, u'default_profile_image': False, u'friends_count': 373, u'profile_background_image_url_https': u'', u'profile_use_background_image': False, u'profile_sidebar_fill_color': u'EFEFEF', u'profile_link_color': u'009999', u'profile_image_url': u'', u'time_zone': u'Quito', u'is_translator': False, u'screen_name': u'lovebscott', u'url': u'', u'verified': True, u'geo_enabled': True, u'profile_background_color': u'131516', u'profile_banner_url': u'', u'id': 14521926, u'profile_background_image_url': u'', u'description': u'#KingofFabulous - #TheMultimediaMaven - Mogul - TV / Internet Personality - @EBONYMag Advice Columnist - @glam_com Contributing Editor', u'is_translation_enabled': False, u'profile_background_tile': True, u'favourites_count': 14, u'name': u'B. Scott', u'notifications': None, u'follow_request_sent': None, u'profile_text_color': u'333333', u'created_at': u'Fri Apr 25 03:29:42 +0000 2008', u'profile_location': None, u'contributors_enabled': False, u'location': u'Los Angeles, CA', u'entities': {u'url': {u'urls': [{u'indices': [0, 22], u'url': u'', u'expanded_url': u'', u'display_url': u''}]}, u'description': {u'urls': []}}, u'followers_count': 161968, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'14521926', u'default_profile': False, u'following': None, u'protected': False, u'profile_image_url_https': u'', u'listed_count': 1905}}

You can write a recursive function to go arbitrarily deep inside the dictionary concatenating keys until it runs across an item that isn't a dictionary that it will consider the 'final value' for that node of the tree.

def process(indict, current_key=None, outerdict=None):
    """Flatten a nested dictionary, joining nested keys with ``'__'``.

    Dictionaries are descended into recursively. Any value that is not a
    dictionary (lists included) is treated as the final value for its node
    and stored unchanged.

    Args:
        indict: The (possibly nested) dictionary to flatten.
        current_key: Key prefix accumulated so far; ``None`` at the top.
        outerdict: Dictionary that receives the flattened items. A fresh one
            is created when omitted.

    Returns:
        ``outerdict``, filled with one entry per non-dictionary value.
    """
    if outerdict is None:
        outerdict = {}
    # .items() is available on both Python 2 and Python 3 dictionaries.
    for key, value in indict.items():
        newkey = current_key + '__' + key if current_key else key
        if isinstance(value, dict):
            # Not a leaf: recurse, extending the key prefix.
            process(value, current_key=newkey, outerdict=outerdict)
        else:
            outerdict[newkey] = value
    return outerdict

With the result being:

>>> pprint.pprint(process(user))
{u'user__contributors_enabled': False,
 u'user__created_at': u'Fri Apr 25 03:29:42 +0000 2008',
 u'user__default_profile': False,
 u'user__default_profile_image': False,
 u'user__description': u'#KingofFabulous - #TheMultimediaMaven - Mogul - TV / Internet Personality - @EBONYMag Advice Columnist - @glam_com Contributing Editor',
 u'user__entities__description__urls': [],
 u'user__entities__url__urls': [{u'display_url': u'',
                                 u'expanded_url': u'',
                                 u'indices': [0, 22],
                                 u'url': u''}],
 u'user__favourites_count': 14,
 u'user__follow_request_sent': None,
 u'user__followers_count': 161968,
 u'user__following': None,
 u'user__friends_count': 373,
 u'user__geo_enabled': True,
 u'user__id': 14521926,
 u'user__id_str': u'14521926',
 u'user__is_translation_enabled': False,
 u'user__is_translator': False,
 u'user__lang': u'en',
 u'user__listed_count': 1905,
 u'user__location': u'Los Angeles, CA',
 u'user__name': u'B. Scott',
 u'user__notifications': None,
 u'user__profile_background_color': u'131516',
 u'user__profile_background_image_url': u'',
 u'user__profile_background_image_url_https': u'',
 u'user__profile_background_tile': True,
 u'user__profile_banner_url': u'',
 u'user__profile_image_url': u'',
 u'user__profile_image_url_https': u'',
 u'user__profile_link_color': u'009999',
 u'user__profile_location': None,
 u'user__profile_sidebar_border_color': u'FFFFFF',
 u'user__profile_sidebar_fill_color': u'EFEFEF',
 u'user__profile_text_color': u'333333',
 u'user__profile_use_background_image': False,
 u'user__protected': False,
 u'user__screen_name': u'lovebscott',
 u'user__statuses_count': 58304,
 u'user__time_zone': u'Quito',
 u'user__url': u'',
 u'user__utc_offset': -18000,
 u'user__verified': True}

Upvotes: 2

