Reputation: 589
I would like to fetch the timestamp if the Status is Reached and if the Status is Started. But I see a problem here with the current approach that I have used here (maxsplit). Because of the below line that contains (\n\n) I am getting the error mentioned below. Some of the data that has "\n\n" and some doesn't. How we can handle this case? I just need Status, is it possible to filter out Cupdate key?
"detail": "Info from Railway\nTrain: T12049 \nStatus: Reached\nCupdate: Arrival Info: On Time.\n\nIRCTC"
Code:
import json
from operator import itemgetter
import pandas as pd
result_fields_getter = itemgetter("pnr_number", "Reservation Date")
def extract_started_reached_tstps(booking_logs):
if booking_logs is None:
return None, None
started = None
reached = None
for log in booking_logs:
if log["pnr_category"] == "booking_via_irctc":
details = dict(line.split(': ', maxsplit=1) for line in log["detail"].splitlines()[1:])
if details["Status"] == "Started":
started = log["tstp"]
elif details["Status"] == "Reached":
reached = log["tstp"]
if started is not None and reached is not None:
break
return started, reached
def main():
with open(r'C:\Users\grknath\Desktop\responses.json', encoding='utf-8') as json_file:
data = json.load(json_file)
results = [
(
*result_fields_getter(record["results"]),
*extract_started_reached_tstps(record["results"]["BookingLogs"]),
)
for record in data["ndatas"]
]
print(results)
df = pd.DataFrame(
results, columns=["pnr_number", "Creation_Date", "Started", "Reached"]
)
df.to_excel(r'C:\Users\grknath\Desktop\resulls.xlsx', index=False)
if __name__ == "__main__":
main()
response.json
{
"ndatas": [
{
"results": {
"pnr_number": "PNR9087651232",
"Reservation Date": "2020-09-29T10:33:55.000+0000",
"Current State": "Waiting List",
"BookingLogs": [
{
"pnr_category": "agent",
"tstp": "2020-09-29T10:54:56.000+0000",
"detail": "Booking Closed: Updated customer"
},
{
"pnr_category": "Railway",
"tstp": "2020-09-29T10:56:41.000+0000",
"detail": "Tatkal tickets reservation is open"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-09-29T10:56:54.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Started\nCupdate: Functioning on Time"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-09-30T14:44:34.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Reached\nCupdate: On Time"
},
{
"pnr_category": "agent",
"tstp": "2020-10-01T07:12:20.000+0000",
"detail": "All bookings Truncated"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-10-07T15:30:16.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Cancelled\nCupdate: Heavy Rain"
}
],
"from": "Kolkatta",
"to loc": "Mumbai"
}
},
{
"results": {
"pnr_number": "PNR90876512322",
"Reservation Date": "2020-09-29T10:33:55.000+0000",
"Current State": "Waiting List",
"BookingLogs": [
{
"pnr_category": "agent",
"tstp": "2020-09-29T10:54:56.000+0000",
"detail": "Booking Closed: Updated customer"
},
{
"pnr_category": "Railway",
"tstp": "2020-09-29T10:56:41.000+0000",
"detail": "Tatkal tickets reservation is open"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-09-29T10:56:54.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Started\nCupdate: Functioning on Time"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-09-30T14:44:34.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Reached\nCupdate: Arrival Info: On Time.\n\nIRCTC"
},
{
"pnr_category": "agent",
"tstp": "2020-10-01T07:12:20.000+0000",
"detail": "All bookings Truncated"
},
{
"pnr_category": "booking_via_irctc",
"tstp": "2020-10-07T15:30:16.000+0000",
"detail": "Info from Railway\nTrain: T12049 \nStatus: Cancelled\nCupdate: Heavy Rain"
}
],
"from": "Kolkatta",
"to loc": "Mumbai"
}
}
]
}
Error:
C:\DeveloperArea\PycharmProjects\python-rest-client\venv\Scripts\python.exe C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py
Traceback (most recent call last):
File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 48, in <module>
main()
File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 31, in main
results = [
File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 34, in <listcomp>
*extract_started_reached_tstps(record["results"]["BookingLogs"]),
File "C:/DeveloperArea/PycharmProjects/python-rest-client/demo_data.py", line 14, in extract_started_reached_tstps
details = dict(line.split(': ', maxsplit=1) for line in log["detail"].splitlines()[1:])
ValueError: dictionary update sequence element #3 has length 1; 2 is required
Process finished with exit code 1
Upvotes: 0
Views: 46
Reputation: 95957
Your question really has nothing to do with json. The problem is how to handle the deserialized data structure, and really, in particular, how to parse this string. I think you should probably just use regex, here's a pattern that splits only on solitary newlines, using a negative lookbehind and a negative lookahead:
>>> regex = re.compile("(?<!\n)\n(?!\n)")
>>> regex.split(s)
['Info from Railway', 'Train: T12049 ', 'Status: Reached', 'Cupdate: Arrival Info: On Time.\n\nIRCTC']
But if all you care about is the status, just use a regex to capture that, maybe with something as simple as all non-whitespace...:
>>> regex = re.compile(r"Status: (\S+)")
>>> regex.search(s).group()
'Status: Reached'
>>> regex.search(s).group(1)
'Reached'
Upvotes: 1