wizkids121
wizkids121

Reputation: 656

Excluding a previous job's results to get around API rate limits

I have a Python script that is pulling data from the Instagram API. It does the following:

  1. Pulls a list of an Instagram users followers
  2. Goes through that list of followers and pull all of their photo data

The script looks like this (to make it reproducible, just insert your Instagram login information):

from ftplib import error_proto
from time import sleep
from instagram_private_api import Client, ClientCompatPatch
from operator import itemgetter
import pandas as pd
import json
import requests
from collections import Counter
import datetime

# Instagram credentials (placeholders — insert real login info to run).
user_name = "XXXXX"
password = "XXXXXXX"

# Authenticated private-API client; used by all functions below.
api = Client(user_name, password)

# Seed accounts to process: [username, numeric Instagram user id].
players = [['carter.bins', '14304399497']]
player_df = pd.DataFrame(players, columns=['username', 'userId'])
print(player_df)

def pull_followers(username_instagram, userid_instagram):
    """Fetch the complete follower list of one account, following pagination.

    Parameters:
        username_instagram: username of the account whose followers we pull
            (copied into every output row for traceability).
        userid_instagram: numeric Instagram user id used for the API calls.

    Returns:
        A list of rows sorted by follower username, each row being
        [pk, full_name, username, is_private, profile_pic_url,
         'follower', following_username, following_userid].
    """
    followers = []
    results = api.user_followers(userid_instagram, rank_token=api.generate_uuid())
    followers.extend(results.get('users', []))

    # Instagram pages results; keep requesting while a cursor is returned.
    next_max_id = results.get('next_max_id')
    while next_max_id:
        results = api.user_followers(userid_instagram,
                                     rank_token=api.generate_uuid(),
                                     max_id=next_max_id)
        followers.extend(results.get('users', []))
        next_max_id = results.get('next_max_id')

    # Build one row per follower directly instead of eight parallel
    # index-based lists zipped together — same columns, same order.
    rows = [
        [f['pk'], f['full_name'], f['username'], f['is_private'],
         f['profile_pic_url'], 'follower',
         str(username_instagram), str(userid_instagram)]
        for f in followers
    ]
    # Sort by follower username (column index 2), ascending.
    return sorted(rows, key=itemgetter(2))
 
# Creating dataframe
# NOTE(review): this module-level `df` is never used anywhere below —
# looks like leftover scaffolding; candidate for removal.
df = pd.DataFrame()
# Extract photos information
def pull_photo_information(username_insta):
    # Initialize the lists
    likes=[]
    comments_count=[]
    url=[]
    teste=[]
    data_foto=[]
    latitudelista = []
    longitudelista = []
    locationlista = []
    caption_photo=[]
    curtidores_username=[]
    curtidores_fullname=[]
    username_lista=[]
    # Extract all photos information (while for pagination)
    request = api.username_feed(username_insta)
    #print(request, "@@@@@")

    teste.extend(request.get('items'))
    next_max_id = request.get('next_max_id')
    while next_max_id:
        request = api.username_feed(username_insta, max_id=next_max_id)
        next_max_id = request.get('next_max_id')
        teste.extend(request.get('items'))
    # Número de likes em todas as fotos
    for i in range(0,len(teste)):
        # username
        username_lista.append(username_insta)
        # Date
        if 'taken_at' in teste[i]:
            data_foto.append(datetime.datetime.utcfromtimestamp(teste[i]['taken_at']).strftime('%Y-%m-%d %H:%M:%S'))
        else:
            data_foto.append('-')
        # Caption text photo
        if ('caption' in teste[i]) and (not teste[i]['caption'] is None):
            titulo_foto=str(teste[i]['caption']['text'])
            caption_photo.append(titulo_foto)
        else:
            caption_photo.append('-')
        # Number of likes
        if 'like_count' in teste[i]:
            likes.append(teste[i]['like_count'])
        else:
            likes.append('-')
        # Comments count
        if 'comment_count' in teste[i]:
            comments_count.append(teste[i]['comment_count'])
        else:
            comments_count.append('-')
        # Latitude, longitude, location
        if 'lat' in teste[i]:
            latitudelista.append(teste[i]['lat'])
        else:
            latitudelista.append('-')
        if 'lng' in teste[i]:
            longitudelista.append(teste[i]['lng'])
        else:
            longitudelista.append('-')
        if 'location' in teste[i]:
            locationlista.append(teste[i]['location']['city'])
        else:
            locationlista.append('-')
        # URL
        if 'carousel_media' not in teste[i]:
            url.append(teste[i]['image_versions2']['candidates'][0]['url'])
        else:
            url.append(teste[i]['carousel_media'][0]['image_versions2']['candidates'][0]['url'])
    combinacao21=[]
    combinacao21.extend([list(i) for i in zip(data_foto, username_lista,
                                        likes, comments_count, caption_photo, locationlista, latitudelista, longitudelista, url)])
    combinacao21 = sorted(combinacao21, key=itemgetter(1), reverse=False)
    return(combinacao21)

def pull_photo_information_user(followers_df):
    """Pull photo data for every follower in followers_df, one at a time.

    Stops early as soon as Instagram's rate-limit message is returned so
    the partial results gathered so far can still be saved by the caller.

    Parameters:
        followers_df: DataFrame with a 'username' column of accounts to pull.

    Returns:
        (list_user_by_user, error_to_fetch_user_photo):
        per-user photo row lists, and the usernames that raised other errors.
    """
    list_user_by_user = []
    error_to_fetch_user_photo = []
    total_user = set()
    fetched, failed = 0, 0
    size = len(followers_df)
    for i in range(size):
        username = followers_df["username"][i]
        try:
            new_list = pull_photo_information(username)
            fetched += 1
            list_user_by_user.append(new_list)
            total_user.add(username)
        except Exception as e:
            message = str(e)
            if message == "Please wait a few minutes before you try again.":
                # Rate limited — stop so partial results survive this run.
                print("break after " + str(i) + " entry -> Error" + message)
                break
            else:
                # Any other failure (private profile, transient error, ...):
                # remember the username so it can be retried later.
                error_to_fetch_user_photo.append(username)
                failed += 1

    print(len(list_user_by_user), "error", len(error_to_fetch_user_photo))
    print(len(total_user), total_user)
    print(fetched, failed)
    return list_user_by_user, error_to_fetch_user_photo
    # return new_list

#followers_df = pd.read_csv("Output_carter.bins.csv")

photo_columns = ['Date', 'Username', 'Likes', 'Comments', 'Title Photo',
                 'Location', 'Latitude', 'Longitude', 'URL']

for i in range(len(player_df)):
    # Fetch followers for each input user, one by one; pull_followers
    # returns a list of rows covering the full (paginated) follower set.
    get_followers = pull_followers(player_df['username'][i], player_df["userId"][i])
    followers_df = pd.DataFrame(get_followers, columns=['userID', 'Full Name', 'username', 'is_private', 'Profile Picture', 'Type', 'following_username', 'following_userid'])
    print("User" + str(i) + " Dataframe created")
    # Persist the follower list before the slow per-follower photo pulls.
    followers_df.to_csv("Output_" + player_df['username'][i] + ".csv", index=False)
    sleep(180)  # back off between heavy API phases to ease rate limits

    followers_photo_df, error = pull_photo_information_user(followers_df)

    # Build one frame per user and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and repeated appends in a loop are O(n^2).
    frames = [pd.DataFrame(rows, columns=photo_columns)
              for rows in followers_photo_df]
    if frames:
        photos_user = pd.concat(frames, ignore_index=True)
    else:
        photos_user = pd.DataFrame(columns=photo_columns)

    # NOTE(review): rows with a '-' placeholder date would make this raise;
    # consider pd.to_datetime(..., errors='coerce') if that ever occurs.
    photos_user['Date'] = pd.to_datetime(photos_user['Date'])
    print("Fetched User" + str(i) + " Followers Photos information")
    photos_user.to_csv("Photos_" + player_df["username"][i] + ".csv", index=False)

    # Save the usernames that failed so they can be retried in a later run.
    error_df = pd.DataFrame(error, columns=["error_user"])
    error_df.to_csv("Error_List_" + player_df["username"][i] + ".csv", index=False)

Thanks to rate limits with Instagram's API, after running this one time, the script for photos_user produces data for the following usernames. These are not all the usernames, though.

array(['06gomes', '247batco', '44_hs', '69restocoupe', '69restocoupe5',
   '9vmarie', '__dsheltonnn', '__eleiguess_', '__pt03', '_alexiskyy_',
   '_baileyjameson', '_coltonpetersen', '_daanyaaa', '_emilio_hdz',
   '_itzxdillon', '_jacobarruda', '_jordy_13', '_ka28den',
   '_kendallrjones_', '_kiramckechnie', '_michaelveto32',
   '_raeeeofsunshine_', 'a.perrrko', 'a1__doe', 'a__jeezy',
   'aaarruda_27', 'aandreyes_pa17', 'aaron_arruda1',
   'aaronshortridge13', 'aaronstrongggg', 'abbigayleroberts',
   'ac_investaslab', 'adam_bernero', 'agambyy', 'aidan_s32',
   'aj_fit707', 'ajcipolla', 'alanhuston75', 'alexdirickson',
   'alexi_s_l.ove', 'alexn145', 'allyvannoy', 'alphonsograyson',
   'altoonabrookies', 'altoonacurve', 'altoonacurve_pics',
   'amanda_lee_24', 'amurphywriter', 'anari310', 'andy.tygrest_22',
   'annajaner1997', 'anthony6__', 'anthonydeldotto', 'antoinemistico',
   'aquasox.news', 'arianitaaa_', 'artravs', 'asap_ant5',
   'ashton_bryanreynoldsfan_2021', 'asmitty85', 'austin_guibor',
   'austin_shenton', 'ayden.hartman32', 'aydenw2009', 'ayejaybabe',
   'ayeyonish10', 'b.clifton13', 'b.dixon3', 'b.m.williamson_',
   'b45baseball', 'baby.jaane', 'bae_ji_hwan', 'baircountry',
   'balancedbridgesports', 'balmybuffalo1', 'barsottid',
   'baseball_for_life77', 'battlebornfitted', 'baweyandt',
   'beccazook', 'bennymiller__', 'bgome35', 'bigleagueadv',
   'bigleaguefaith', 'bjperez05', 'bk_bennyandthejets',
   'blakes_battle06', 'blakesabol', 'blaketownsend_',
   'blaze_mcknight081', 'bleezie4', 'bligh_madris', 'br.andon8763',
   'brad_queso', 'bradenbishop1', 'bradleychapman123', 'bradybasso',
   'brayden_ramsey_', 'breframp', 'brendanerwin9', 'brendtcitta',
   'bretny3253', 'brett_e6', 'brock_minich', 'broupe75',
   'brycen_dawley', 'c.ody11', 'cademarlowe', 'cailuuuhh',
   'calraleigh_', 'calvin.mitch', 'camii_araaceli', 'canaansmith_',
   'carsonvitale', 'cashglad12', 'catchingcamp', 'cbroschard15',
   'cesarizturisjr3', 'cese0903', 'chadwoodring', 'chandlerbats',
   'chang__0211', 'chase_leehong', 'chaseywaseyy', 'chefcham22',
   'chefjesus_raya3', 'chelseagildea', 'chipperchastain',
   'chloechedester', 'chris.wright9', 'chrismccready1'], dtype=object)

The objective would then be to run the job again, but when it comes time to collect for photos_user it excludes the usernames above, and runs the next batch of them. Once that finishes, it then appends that list to the final df, and you keep running until all the followers photo data is successfully captured. So the more data that is captured and appended to the final photos_user df, the less that gets pulled the next time I run it.

So the account we're starting with here (players = [['carter.bins', '14304399497']]) has a little over 1000 followers. Of those 1000 or so followers, about 575 of them have public profiles. The first time I ran it, it produced the photo data for 131 of those individuals. So the next time I run it, it will ignore those 131 usernames in followers_df and run the next however many it can run under the rate limit.

Does that make sense? I'm having trouble trying to write the batch job.

Upvotes: 0

Views: 85

Answers (1)

Sachin Salve
Sachin Salve

Reputation: 146

The relevant snippet of your code, updated to skip usernames whose photo data is already stored, would look like this (using a MongoDB collection as the record of completed usernames).

# Skip any username already captured in a previous run: check the MongoDB
# collection for an existing document before pulling its photos.
# Note: query with the single i-th username, not the whole Series —
# `followers_df["username"]` would pass a pandas Series as the filter value.
if (collection.count_documents({"username": followers_df["username"][i]}, limit=1) != 0):
    print(
        "photo data has already captured for this username",
    )
else:
    new_list = pull_photo_information(followers_df["username"][i])
    cnt += 1
    list_user_by_user.append(new_list)
    total_user.add(followers_df["username"][i])

Upvotes: 1

Related Questions