Reputation: 8284
So, toward the end of my first file, which we'll call /file.py:
import datetime
import math
from csv import DictWriter

import pandas

import config

def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
    # normalize column names so they are valid itertuples() attributes
    excel_data.columns = excel_data.columns.str.strip()
    excel_data.columns = excel_data.columns.str.replace("/", "_")
    excel_data.columns = excel_data.columns.str.replace(" ", "_")
    total_records = 0
    num_valid_records = 0
    num_invalid_records = 0
    for row in excel_data.itertuples():
        mrn = row.MRN
        total_records += 1
        # pandas.isna() covers None and NaN floats; math.isnan() raises on strings
        if mrn in ("", " ", "N/A", "NaT", "NaN") or pandas.isna(mrn):
            num_invalid_records += 1
            # drop by index label; row.Index is already the label, not a position
            excel_data = excel_data.drop(row.Index)
        else:
            # count this row only, not every row in the frame
            num_valid_records += 1
    with open("./logs/metrics.csv", "a", newline="\n") as f:
        csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
        currentDT = datetime.datetime.now()
        csv_writer.writerow(dict(date=currentDT,
                                 total_records=total_records,
                                 processed=num_valid_records,
                                 skipped=num_invalid_records,
                                 success_rate=num_valid_records / total_records * 100))
    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file I would like to pick up that end state and do another iteration over the dataframe, for instance in second_file.py:
from time import sleep

def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    error_flag = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working: data_frame does not exist in this file
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, errors
Both files import pandas.
Upvotes: 1
Views: 542
Reputation: 18647
Use DataFrame.to_pickle and pandas.read_pickle:
To persist:
df.to_pickle('./dataframe.pkl')
To load:
df = pd.read_pickle('./dataframe.pkl')
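Applied to the question's two files, a minimal sketch could look like this (the './dataframe.pkl' path and the hand-off into process_records are assumptions, not part of the original code):

# at the end of /file.py, once clean_data_frame() has returned
excel_data.to_pickle('./dataframe.pkl')   # serialize the cleaned frame to disk

# at the start of second_file.py
import pandas
records = pandas.read_pickle('./dataframe.pkl')   # same frame, restored
# then hand it to the loop, e.g. self.process_records(records, map_data)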
Upvotes: 1
Reputation: 846
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file; here is a link to some info: pickle info
import pandas as pd
import pickle
x = pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]})
#this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)
#to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)
#you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it at the start of script two.
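Tying that back to the question, a minimal sketch of the hand-off, assuming /file.py dumped the cleaned frame to 'pickledata.p' (the file name is just an example):

import pickle

# second_file.py: restore the frame that /file.py pickled
with open('pickledata.p', 'rb') as fh:
    records = pickle.load(fh)   # the cleaned dataframe from script one

# the restored object is a normal dataframe, so process_records can iterate it
for record in records.itertuples():
    print(record.MRN)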
Upvotes: 1