How to handle de-serialized data structure

Question

I would like to fetch the timestamp if the Status is Reached and if the Status is Started. But I see a problem here with the current approach that I have used here (maxsplit). Because of the below line that contains ( ) I am getting the error mentioned below. Some of the data that has " " and some doesn't. How we can handle this case? I just need Status, is it possible to filter out Cupdate key?

"detail": "Info from Railway
Train: T12049 
Status: Reached
Cupdate: Arrival Info: On Time.

IRCTC"

Code:

import json
from operator import itemgetter
import pandas as pd

result_fields_getter = itemgetter("pnr_number", "Reservation Date")

def extract_started_reached_tstps(booking_logs):
    if booking_logs is None:
        return None, None
    started = None
    reached = None
    for log in booking_logs:
        if log["pnr_category"] == "booking_via_irctc":
            details = dict(line.split(': ', maxsplit=1) for line in log["detail"].splitlines()[1:])

            if details["Status"] == "Started":
                started = log["tstp"]
            elif details["Status"] == "Reached":
                reached = log["tstp"]

            if started is not None and reached is not None:
                break

    return started, reached


def main():
    with open(r'C:\Users\grknath\Desktop
esponses.json', encoding='utf-8') as json_file:
        data = json.load(json_file)

    results = [
        (
            *result_fields_getter(record["results"]),
            *extract_started_reached_tstps(record["results"]["BookingLogs"]),
        )
        for record in data["ndatas"]
    ]
    print(results)

    df = pd.DataFrame(
        results, columns=["pnr_number", "Creation_Date", "Started", "Reached"]
    )

    df.to_excel(r'C:\Users\grknath\Desktop
esulls.xlsx', index=False)


if __name__ == "__main__":
    main()

response.json

{
"ndatas": [
{   
    "results": {
        "pnr_number": "PNR9087651232",
        "Reservation Date": "2020-09-29T10:33:55.000+0000",
        "Current State": "Waiting List",
        "BookingLogs": [
            {
                "pnr_category": "agent",
                "tstp": "2020-09-29T10:54:56.000+0000",
                "detail": "Booking Closed: Updated customer"
            },
            {
                "pnr_category": "Railway",
                "tstp": "2020-09-29T10:56:41.000+0000",
                "detail": "Tatkal tickets reservation is open"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-09-29T10:56:54.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Started
Cupdate: Functioning on Time"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-09-30T14:44:34.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Reached
Cupdate: On Time"
            },
            {
                "pnr_category": "agent",
                "tstp": "2020-10-01T07:12:20.000+0000",
                "detail": "All bookings Truncated"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-10-07T15:30:16.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Cancelled
Cupdate: Heavy Rain"
            }
        ],
        "from": "Kolkatta",
        "to loc": "Mumbai"
    }   
},
{   
    "results": {
        "pnr_number": "PNR90876512322",
        "Reservation Date": "2020-09-29T10:33:55.000+0000",
        "Current State": "Waiting List",
        "BookingLogs": [
            {
                "pnr_category": "agent",
                "tstp": "2020-09-29T10:54:56.000+0000",
                "detail": "Booking Closed: Updated customer"
            },
            {
                "pnr_category": "Railway",
                "tstp": "2020-09-29T10:56:41.000+0000",
                "detail": "Tatkal tickets reservation is open"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-09-29T10:56:54.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Started
Cupdate: Functioning on Time"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-09-30T14:44:34.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Reached
Cupdate: Arrival Info: On Time.

IRCTC"
            },
            {
                "pnr_category": "agent",
                "tstp": "2020-10-01T07:12:20.000+0000",
                "detail": "All bookings Truncated"
            },
            {
                "pnr_category": "booking_via_irctc",
                "tstp": "2020-10-07T15:30:16.000+0000",
                "detail": "Info from Railway
Train: T12049 
Status: Cancelled
Cupdate: Heavy Rain"
            }
        ],
        "from": "Kolkatta",
        "to loc": "Mumbai"
    }   
}
]
}

Error:

C:\DeveloperArea\PycharmProjects\python-rest-client\venv\Scripts\python.exe C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py
Traceback (most recent call last):
  File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 48, in 
    main()
  File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 31, in main
    results = [
  File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 34, in 
    *extract_started_reached_tstps(record["results"]["BookingLogs"]),
  File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 14, in extract_started_reached_tstps
    details = dict(line.split(': ', maxsplit=1) for line in log["detail"].splitlines()[1:])
ValueError: dictionary update sequence element #3 has length 1; 2 is required

Process finished with exit code 1

How to handle de-serialized data structure

Answers (1)

Related Questions