Reputation: 580
I am trying to get the data from Google analytics Reporting API, the problem is that the first request quickly gets the data but the next request is taking around 5 mins.Does it happen because of my pagination code which is happening in while loop?
My Page Size is "300". I used this size to check my pagination.
from apiclient.discovery import build
from time import sleep
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = 'analytcisproject.json'
VIEW_ID = '182363141'
#dd = []
#for pag_index in range(0, 50):
# dd.append(service.data().ga().get( ids='ga:', start_date='2018-04-01', end_date='2018-04-30', dimensions = "ga:date,ga:dimension2", metrics='ga:pageviews', start_index=str(pag_index*10000+1), max_results=str(pag_index*10000+10000)).execute())
#print(dd)
def initialize_analyticsreporting():
credentials = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION, SCOPES)
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
def get_report(analytics, pageTokenVariable):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': '50daysAgo', 'endDate': 'today'}],
'metrics': [{'expression': 'ga:sessions'},{'expression': 'ga:users'},{'expression': 'ga:pageviews'}
,{'expression': 'ga:sessionDuration'},{'expression': 'ga:timeOnPage'},{'expression': 'ga:pageLoadTime'},{'expression': 'ga:bounceRate'}],
'dimensions': [{'name': 'ga:date'},{'name': 'ga:browser'},
{'name':'ga:city'},{'name':'ga:country'},{'name':'ga:medium'},
{'name':'ga:continent'},{'name':'ga:landingScreenName'},{'name':'ga:pagePath'}
,{'name':'ga:userType'}],
"pageToken": pageTokenVariable,
"pageSize": "300",
}]
}
).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
j=0
list = []
pagetoken = report.get('nextPageToken', None)
print(pagetoken)
#------printing the pagetoken here returns `100,000` which is expected
for report in response.get('reports', []):
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
if ',' in value or '.' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = int(value)
list.append(dict)
j=j+1
print(j)
#print(response)
# Append that data to a list as a dictionary
# pagination function
while pagetoken != "" and pagetoken != None: # This says while there is info in the nextPageToken get the data, process it and add to the list
i=0
response = get_report(analytics, pagetoken)
#pagetoken = response['reports'][0]['nextPageToken']
pagetoken = response['reports'][0].get('nextPageToken')
print(pagetoken)
#------printing the pagetoken here returns `200,000` as is expected but the data being pulled is the same as for the first batch and so on. While in the loop the pagetoken is being incremented but it does not retrieve new data
for row in rows:
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
if ',' in value or '.' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = int(value)
list.append(dict)
sleep(1)
i=i+1
print(i)
#df = pd.DataFrame(list)
if i==3:
break;
print('afn')
df = pd.DataFrame(list)
#df.head()
#response
Upvotes: 0
Views: 210
Reputation: 431
Sorry I'm not a python expert. You have a "sleep(1)" in your row loop, so 1 second for each row = 300 seconds per row (5 mins). that would make the most sense to me.
Try removing the sleep in your row loop.
I hope that helps.
Upvotes: 1