Export data to gsheet workbooks / worksheets after looping through a script 10 times

Question

I have the following script I'm running to get data from Google's pagespeed insights tool via API:

from datetime import datetime
from urllib import request
import requests
import pandas as pd
import numpy as np
import re
from os import truncate
import xlsxwriter
import time
import pygsheets
import pickle

domain_strip = 'https://www.example.co.uk'
gc = pygsheets.authorize(service_file='myservicefile.json')
API = "myapikey"
strat = "mobile"

def RunCWV():
    with open('example_urls_feb_23.txt') as pagespeedurls:
        content = pagespeedurls.readlines()
        content = [line.rstrip('
') for line in content]
    
#Dataframes
    dfCWV2 = pd.DataFrame({'Page':[],'Overall Performance Score':[],'FCP (seconds) CRUX':[],'FCP (seconds) Lab':[],'FID (seconds)':[],'Max Potential FID (seconds)':[],'LCP (seconds) CRUX':[],'LCP (seconds) Lab':[],'LCP Status':[],'CLS Score CRUX':[],'Page CLS Score Lab':[],'CLS Status':[],'Speed Index':[],'Uses Efficient Cache Policy?':[],'Landing Page':[]})        
    dfCLSPath2 = pd.DataFrame({'Page':[],'Path':[],'Selector':[],'Node Label':[],'Element CLS Score':[],'Landing Page':[],'large_uid':[]})
    dfUnsizedImages2 = pd.DataFrame({'Page':[],'Image URL':[],'Landing Page':[],'unsized_uid':[]}) 
    dfNCAnim2 = pd.DataFrame({'Page':[],'Animation':[],'Failure Reason':[],'Landing Page':[]})
    dfLCP_Overview = pd.DataFrame({'Page':[],'Preload LCP Savings (seconds)':[],'Resize Images Savings (seconds)':[],'Text Compression Savings (seconds)':[],'Preload Key Requests Savings (seconds)':[],'Preconnect Savings (seconds)':[],'Unused CSS Savings (seconds)':[],'Unused JS Savings (seconds)':[],'Unminified CSS Savings (seconds)':[],'Unminified JS Savings (seconds)':[],'Efficiently Animated Content Savings':[],'Landing Page':[]})       
    dfLCPOb2 = pd.DataFrame({'Page':[],'LCP Tag':[],'LCP Tag Type':[],'LCP Image Preloaded?':[],'Wasted Seconds':[],'Action':[],'Landing Page':[]})
    dfresize_img = pd.DataFrame({'Page':[],'Image URL':[],'Total Bytes':[],'Wasted Bytes':[],'Overall Savings (seconds)':[],'Action':[],'Landing Page':[]})
    dfFontDisplay2 = pd.DataFrame({'Page':[],'Resource':[],'Font Display Utilised?':[],'Wasted Seconds':[],'Action':[],'Landing Page':[]})
    dfTotalBW2 = pd.DataFrame({'Page':[],'Total Byte Weight of Page':[],'Large Network Payloads?':[],'Resource':[],'Total KB':[],'Landing Page':[]})
    dfRelPreload2 = pd.DataFrame({'Page':[],'Resource':[],'Wasted Seconds':[],'Landing Page':[]})
    dfRelPreconnect2 = pd.DataFrame({'Page':[],'Resource':[],'Wasted Ms':[],'Passed Audit':[],'Landing Page':[]})
    dfTextCompression2 = pd.DataFrame({'Page':[],'Text Compression Optimal?':[],'Action':[],'Savings':[],'Landing Page':[]})
    dfUnusedCSS2 = pd.DataFrame({'Page':[],'CSS File':[],'Unused CSS Savings KiB':[],'Unused CSS Savings (seconds)':[],'Wasted %':[],'Landing Page':[]})
    dfUnusedJS2 = pd.DataFrame({'Page':[],'JS File':[],'Unused JS Savings (seconds)':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
    dfUnminCSS2 = pd.DataFrame({'Page':[],'CSS File':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
    dfUnminJS2 = pd.DataFrame({'Page':[],'JS File':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
    dfCritRC2 = pd.DataFrame({'Page':[],'Resource':[],'Start Time':[],'End Time':[],'Total Time':[],'Transfer Size':[],'Landing Page':[]})
    dfAnimContent2 = pd.DataFrame({'Page':[],'Efficient Animated Content?':[],'Resource':[],'Total Bytes':[],'Wasted Bytes':[],'Landing Page':[]})
    dfSRT2 = pd.DataFrame({'Page':[],'Passed Audit?':[],'Server Response Time ms':[],'Server Response Time Savings':[],'Landing Page':[]})
    dfRedirects2 = pd.DataFrame({'Page':[],'Redirects':[],'Wasted ms':[],'Landing Page':[]})
    dfFID_Summary2 = pd.DataFrame({'Page':[],'FID (seconds)':[],'Total Blocking Time (seconds)':[],'FID Rating':[],'Total Tasks':[],'Total Task Time of Page (seconds)':[],'Tasks over 50ms':[],'Tasks over 100ms':[],'Tasks over 500ms':[],'3rd Party Total Wasted Seconds':[],'Bootup Time (seconds)':[],'Number of Dom Elements':[],'Mainthread work Total Seconds':[],'Duplicate JS Savings (Seconds)':[],'Legacy JS Savings (seconds)':[],'Landing Page':[]})
    dflongTasks2 = pd.DataFrame({'Page':[],'Task':[],'Task Duration Seconds':[],'Total Tasks':[],'Total Task Time of Page (seconds)':[],'Tasks over 50ms':[],'Tasks over 100ms':[],'Tasks over 500ms':[],'Landing Page':[]})
    dfthirdP2 = pd.DataFrame({'Page':[],'3rd Party Total wasted Seconds':[],'3rd Party Total Blocking Time (seconds)':[],'3rd Party Resource Name':[],'Landing Page':[]})
    dfbootup2 = pd.DataFrame({'Page':[],'Page Bootup Time Score':[],'Resource':[],'Time spent Parsing / Compiling Ms':[]})
    dfthread2 = pd.DataFrame({'Page':[],'Score':[],'Mainthread work total seconds':[],'Mainthread work Process Type':[],'Duration (Seconds)':[],'Landing Page':[]})
    dfDOM2 = pd.DataFrame({'Page':[],'Dom Size Score':[],'DOM Stat':[],'DOM Value':[],'Landing Page':[],})
    dfdupJS2 = pd.DataFrame({'Page':[],'Score':[],'Audit Status':[],'Duplicate JS Savings (seconds)':[], 'Landing Page':[]})
    dflegacyJS2 = pd.DataFrame({'Page':[],'Audit Status':[],'Legacy JS Savings (seconds)':[],'JS File of Legacy Script':[],'Wasted Bytes':[],'Landing Page':[]})
#Run PSI
    for line in content:
        x = f'https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={line}&strategy={strat}&key={API}'
        print(f'Running CWV Audit on {strat} from: {line} - Please Wait...')
        r = requests.get(x)
        data = r.json()
        line_stripped = line
        if domain_strip in line:
            line_stripped = line_stripped.replace(domain_strip, '')
        else:
            pass
#CWV Overview
        try:
            op_score = data["lighthouseResult"]["categories"]["performance"]["score"] * 100
            fcp_score_CRUX = data["loadingExperience"]["metrics"]["FIRST_CONTENTFUL_PAINT_MS"]["percentile"] / 1000
            fcp_score_LAB = data["lighthouseResult"]["audits"]["first-contentful-paint"]["numericValue"] / 1000
            fid_score = data["loadingExperience"]["metrics"]["FIRST_INPUT_DELAY_MS"]["percentile"] / 1000
            Max_P_FID = data["lighthouseResult"]["audits"]["max-potential-fid"]["numericValue"] / 1000
            lcp_score_CRUX_ms = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"]
            lcp_score_CRUX = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"] / 1000
            lcp_score_LAB = data["lighthouseResult"]["audits"]["first-contentful-paint"]["numericValue"] / 1000
            cls_score_Sitewide = data["loadingExperience"]["metrics"]["CUMULATIVE_LAYOUT_SHIFT_SCORE"]["percentile"] / 100
            cls_score_Page_mult = data["lighthouseResult"]["audits"]["cumulative-layout-shift"]["numericValue"] * 1000
            cls_score_Page = data["lighthouseResult"]["audits"]["cumulative-layout-shift"]["numericValue"]
            speed_index = data["lighthouseResult"]["audits"]["speed-index"]["numericValue"] / 1000
            efficient_cache = data["lighthouseResult"]["audits"]["uses-long-cache-ttl"]["score"]
            if efficient_cache == 1:
                efficient_cache = "Yes"
            else:
                efficient_cache = "No"
            lcp_status = lcp_score_CRUX_ms
            if lcp_score_CRUX_ms <=2500:
                lcp_status = "Good" 
            elif lcp_score_CRUX_ms in range (2501, 4000):
                lcp_status = "Needs Improvement"
            else:
                lcp_status = "Poor"
            cls_status = cls_score_Page_mult
            if cls_score_Page_mult <=100:
                cls_status = "Good" 
            elif cls_score_Page_mult in range (101,150):
                cls_status = "Needs Improvement"
            else:
                cls_status = "Poor"
                
            new_row = pd.DataFrame({'Page':line_stripped,'Overall Performance Score':op_score, 'FCP (seconds) CRUX':round(fcp_score_CRUX,4),'FCP (seconds) Lab':round(fcp_score_LAB,4), 'FID (seconds)':round(fid_score,4),
                                    'Max Potential FID (seconds)':round(Max_P_FID,4), 'LCP (seconds) CRUX':round(lcp_score_CRUX,4),'LCP (seconds) Lab':round(lcp_score_LAB,4), 'LCP Status':lcp_status, 'CLS Score CRUX':round(cls_score_Sitewide,4), 
                                    'Page CLS Score Lab':round(cls_score_Page,4),'CLS Status':cls_status,'Speed Index':round(speed_index,4),'Uses Efficient Cache Policy?':efficient_cache, 'Landing Page':line_stripped}, index=[0])        
            dfCWV2 = pd.concat([dfCWV2, new_row], ignore_index=True) #, ignore_index=True
        except KeyError:                    
            print(f' CWV Summary One or more keys not found {line}.')
        except TypeError:                    
            print(f'TypeError on {line}.')

        print ('CWV Summary')
        print (dfCWV2)

#Export to GSheets line by line
        sh = gc.open('CWV Overview AWP - example Feb 2023')
        worksheet = sh.worksheet_by_title('CWV')
        df_worksheet = worksheet.get_as_df()
        result = pd.concat([df_worksheet, dfCWV2], ignore_index=True)
        result=result.drop_duplicates(keep='last')
        worksheet.set_dataframe(result, 'A1') 
        # #End test
#CLS
    #Large Shifts        
        try:
            for x in range (len(data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"])):
                path = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["path"]
                selector = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["selector"]
                nodeLabel = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["nodeLabel"]
                score = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["score"] 
                i = 1          
                new_row = pd.DataFrame({'Page':line_stripped, 'Path':path, 'Selector':selector, 'Node Label':nodeLabel,'Element CLS Score':round(score,4), 'Landing Page':line_stripped, 'large_uid':i}, index=[0])        
                dfCLSPath2 = pd.concat([dfCLSPath2, new_row], ignore_index=True)
        except KeyError:                    
            print(f' Layout Shift Elements - One or more keys not found {line}.')
        except TypeError:                    
            print(f'TypeError on {line}.')
        print ('Large Shifts')
        print (dfCLSPath2)

        sh = gc.open('CLS Audit AWP - example Feb 2023')
        worksheet = sh.worksheet_by_title('Large CLS Elements')
        df_worksheet = worksheet.get_as_df()
        result = pd.concat([df_worksheet, dfCLSPath2], ignore_index=True)
        result=result.drop_duplicates(keep='last')
        worksheet.set_dataframe(result, 'A1') 
    #Unsized Images
        try:
            for x in range (len(data["lighthouseResult"]["audits"]["unsized-images"]["details"]["items"])):
                unsized_url = data["lighthouseResult"]["audits"]["unsized-images"]["details"]["items"][x]["url"]
                i = 1
                new_row = pd.DataFrame({'Page':line_stripped, 'Image URL':unsized_url, 'Landing Page':line_stripped, 'unsized_uid':i}, index=[0]) 
                dfUnsizedImages2 = pd.concat([dfUnsizedImages2, new_row], ignore_index=True)                          
        except KeyError:                    
            print(f' Unsized Images One or more keys not found {line}.') 
        except TypeError:                    
            print(f'TypeError on {line}.')
        print ('Unsized Images')
        print(dfUnsizedImages2)

  
        sh = gc.open('CLS Audit AWP - example Feb 2023')
        worksheet = sh.worksheet_by_title('Unsized Images')
        df_worksheet = worksheet.get_as_df()
        result = pd.concat([df_worksheet, dfUnsizedImages2], ignore_index=True)
        result=result.drop_duplicates(keep='last')
        worksheet.set_dataframe(result, 'A1')

I've only included the first few TRY blocks as the script is very long. Essentially what I want to do is the same as I have here, but rather than exporting the results from the dataframes after every URL has run, I want to export it, say, every 10 urls (or more). I have around 4000 urls in total and I need to capture the results from the audit for every url

I used to have the script set up to export to gsheets at the end of the whole script with every loop, but I always end up with the script crashing before it loops through every URL I'm auditing which is why I set it up as above to export line by line - it's SUPER slow though, taking over 2 weeks to run through all urls in my text file so I want to speed it up by only exporting every 10 urls worth of data at a time. That way if the script crashes, I've not lost everything, only the last 10 urls.

I tried setting a counter on each of the export blocks:

results = []
results_to_export = []
for i in range(10):
    counter = 0
    while counter < 5000:
        print("Starting loop iteration")
        results.append(dfCWV2)
        counter += 1
    if counter % 10 == 0:
        print("Running after 10 loops")
        result=pd.concat(results, ignore_index=True)
        result=result.drop_duplicates(keep='last')
        # add results to export list
        results_to_export.append(result)
        if results_to_export:
            sh = gc.open('CWV Overview AWP - example Feb 2023')
            worksheet = sh.worksheet_by_title('CWV')
            combined_results = pd.concat(results_to_export, ignore_index=True)
            worksheet.set_dataframe(combined_results, 'A1') 
            results_to_export.clear()  
            results =[]

But this just kept looping through the while loop and not moving onto the next Try block or throwing up errors (I tried every version of unindenting the if statements too but nothing worked).

Please help!

Export data to gsheet workbooks / worksheets after looping through a script 10 times

Answers (1)

A shorter program would be more likely to get an expert answer

Is your `if counter` statement not indented enough?

Related Questions

Export data to gsheet workbooks / worksheets after looping through a script 10 times

Answers (1)

A shorter program would be more likely to get an expert answer

Is your if counter statement not indented enough?

Related Questions

Is your `if counter` statement not indented enough?