James

Reputation: 41

Using Python and pyxlsb to look up data from an xlsb file

Goals: I have a workbook with several sheets. Every sheet has the same format: time in seconds in column 1 and headers in the top row. I want code that looks up a particular time range and column header in a sheet, finds the average of that data, and writes it to a CSV.

Problem: The code is incredibly slow; is there a better method? Sheets are 30,000 rows by 300 columns. The code below only works with a modified, reduced data set.

import pandas as pd
from pyxlsb import open_workbook
import datetime

print("Start", datetime.datetime.now())
# Define the paths
csv_file = r"L:\Projects\P1563 V4\Data\Summary\ANR Sweeps\Python\ANR Sweeps_Python_simple.csv"

# Read the CSV file (without headers)
csv_data = pd.read_csv(csv_file, encoding='ISO-8859-1', header=None)

# Extract parameters from the CSV file using numeric indexing
xlsb_path = csv_data.iloc[2, 0]  # Cell A3
xlsb_file = csv_data.iloc[2, 1]  # Cell B3
sheet_name = csv_data.iloc[0, 4]  # Cell E1
column_header = csv_data.iloc[1, 4]  # Cell E2
start_time = float(csv_data.iloc[2, 2])  # Cell C3
end_time = float(csv_data.iloc[2, 3])  # Cell D3

print("CSV open", datetime.datetime.now())


# Full path to the .xlsb file
xlsb_full_path = f"{xlsb_path}{xlsb_file}.xlsb"

with open_workbook(xlsb_full_path) as wb:
    with wb.get_sheet(sheet_name) as sheet:
        # Use a single row iterator: each call to sheet.rows() returns a
        # fresh generator, so the original next(sheet.rows()) skip was a no-op
        rows = sheet.rows()

        # Get the first row (assuming headers are in the first row)
        headers = next(rows)

        # Create a mapping of header name to its index
        header_map = {cell.v: idx for idx, cell in enumerate(headers)}

        # Fail fast if the desired column header is missing
        if column_header not in header_map:
            raise ValueError(f"Header '{column_header}' not found.")

        data_col_idx = header_map[column_header]
        time_col_idx = 0
        print(f"Header '{column_header}' is at index: {data_col_idx}")

        # Initialize variables for sum and count
        total = 0
        count = 0

        # Iterate through the remaining rows (the header was already consumed)
        for row in rows:
            try:
                # Get the time and data values
                time_value = row[time_col_idx].v
                data_value = row[data_col_idx].v
                # Only average numeric rows within the requested time range
                if isinstance(time_value, float) and isinstance(data_value, (int, float)):
                    if start_time <= time_value <= end_time:
                        total += data_value
                        count += 1
            except IndexError:
                # Skip rows with missing data
                continue

        # Calculate the average
        if count > 0:
            average = total / count
            print(f"Average of '{column_header}' between {start_time} and {end_time}: {average}")
        else:
            print(f"No valid data found in column '{column_header}' between {start_time} and {end_time}.")

Upvotes: 1

Views: 63

Answers (1)

J_H

Reputation: 20550

Your code looks fine.

                    if start_time <= time_value <= end_time:

That's a perfectly nice boolean expression, but there may be an opportunity to optimize it. If you know the 30,000 rows are in chronological order, then you can bail early once you see time values past end_time. For example, if you put this code in a function you could do an early return.
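
A minimal sketch of that early exit, assuming the rows really are sorted by time (the function and variable names here are illustrative, not from the original code):

def average_in_range(rows, time_idx, data_idx, start_time, end_time):
    total = 0.0
    count = 0
    for row in rows:
        time_value = row[time_idx].v
        if not isinstance(time_value, float):
            continue  # skip the header row or blank cells
        if time_value > end_time:
            break  # rows are chronological, so nothing later can match
        if time_value >= start_time:
            total += row[data_idx].v
            count += 1
    return total / count if count else None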

Skipping the initial rows takes O(n) time, linear in the number of skipped rows. You might consider using binary search so your app takes O(log n) time to find the starting point. However, the library your app calls into will probably be stuck reading all the skipped rows anyway, for little net gain.
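
For completeness, a sketch of the binary-search idea using the standard library's bisect module; it only pays off if the time column is already in memory, e.g. loaded once and then queried repeatedly (the sample lists below are made up):

from bisect import bisect_left, bisect_right

times = [0.0, 0.5, 1.0, 1.5, 2.0]       # time column, sorted ascending
values = [10.0, 12.0, 11.0, 13.0, 9.0]  # matching data column

lo = bisect_left(times, 0.5)   # first index with times[lo] >= start_time
hi = bisect_right(times, 1.5)  # first index past the last times[i] <= end_time
window = values[lo:hi]
average = sum(window) / len(window) if window else None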

repeated queries

junk columns

If you query the same sheet several times, with several time ranges, then it would be worth your while to put your data source on a diet and to better organize it. Right now every row has a few hundred junk columns appended at the end. Produce and save "small" worksheets containing just the columns that interest you.
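
One hedged way to do that trimming, assuming a pandas version that supports the pyxlsb engine (the path, sheet, and column names below are hypothetical):

import pandas as pd

# One-time conversion: keep only the time column and the few columns you query
df = pd.read_excel(
    r"L:\path\to\workbook.xlsb",   # hypothetical path
    sheet_name="Sheet1",           # hypothetical sheet
    engine="pyxlsb",
    usecols=["Time", "SensorA"],   # hypothetical headers; drops the junk columns
)
df.to_csv("sheet1_trimmed.csv", index=False)  # small file, fast to re-read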

RDBMS

The best thing you could do is shovel the relevant columns into an indexed table, perhaps using Postgres, or SQLite, which ships as part of Python's standard library.

The index lets you ignore most of the stored rows, focusing just on the relevant time interval. And the database offers COUNT(), SUM(), and AVG() functions.
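
A minimal sqlite3 sketch of that approach; the table and column names are made up, and records stands in for the (time, value) pairs pulled from the sheet:

import sqlite3

records = [(0.0, 10.0), (0.5, 12.0), (1.0, 11.0)]  # (time, value) pairs from the sheet

con = sqlite3.connect("sweeps.db")
con.execute("CREATE TABLE IF NOT EXISTS sweep (t REAL, value REAL)")
con.execute("CREATE INDEX IF NOT EXISTS sweep_t_idx ON sweep (t)")
con.executemany("INSERT INTO sweep VALUES (?, ?)", records)
con.commit()

row = con.execute(
    "SELECT AVG(value), COUNT(*) FROM sweep WHERE t BETWEEN ? AND ?",
    (0.0, 1.0),
).fetchone()
print(row)  # (average, count) over the requested time interval

With the index on t, each query touches only the rows in the requested interval instead of scanning the whole table.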

Upvotes: 0
