Reputation: 343
I'm trying to read the Json returned by an API and play it to a DATAFRAME pyspar, but the file comes out in just one field named _corrupt_record:
my JSON is something like this:
{
"title": "texto",
"nickname": "(app)",
"language": "en",
"folder_id": "0",
"category": "",
"question_count": 3,
"page_count": 3,
"response_count": 2636,
"date_created": "2021-01-27T19:22:00",
"date_modified": "2021-06-01T11:43:00",
"id": "00000",
"buttons_text": {
"next_button": "next",
"prev_button": "prev",
"done_button": "done",
"exit_button": ""
},
"is_owner": true,
"footer": false,
"custom_variables": {
},
"href": "https://api",
"analyze_url": "https://api",
"edit_url": "https://api",
"collect_url": "https://api",
"summary_url": "https://api",
"preview": "https://api",
"pages": [
{
"title": "",
"description": "",
"position": 1,
"question_count": 1,
"id": "00000",
"href": "https://api",
"questions": [
{
"id": "602406071",
"position": 1,
"visible": true,
"family": "matrix",
"subtype": "rating",
"layout": null,
"sorting": null,
"required": {
"text": "texto",
"type": "all",
"amount": "0"
},
"validation": null,
"forced_ranking": false,
"headings": [
{
"heading": "texto"
}
],
"href": "https://api",
"answers": {
"rows": [
{
"position": 1,
"visible": true,
"text": "",
"id": "00000"
}
],
"choices": [
{
"position": 1,
"visible": true,
"text": "texto",
"id": "00000",
"is_na": false,
"weight": 0,
"description": ""
},
{
"position": 2,
"visible": true,
"text": "texto",
"id": "00000",
"is_na": false,
"weight": 0,
"description": ""
},
{
"position": 3,
"visible": true,
"text": "texto",
"id": "00000",
"is_na": false,
"weight": 0,
"description": ""
},
{
"position": 4,
"visible": true,
"text": "texto",
"id": "00000",
"is_na": false,
"weight": 0,
"description": ""
},
{
"position": 5,
"visible": true,
"text": "texto",
"id": "00000",
"is_na": false,
"weight": 0,
"description": ""
}
]
},
"display_options": {
"show_display_number": true,
"display_type": "emoji",
"display_subtype": "star",
"left_label_id": null,
"left_label": "",
"right_label_id": null,
"right_label": "",
"middle_label_id": null,
"middle_label": "",
"custom_options": {
"color": "#f5a623",
"option_set": [
]
}
}
}
]
}
]
}
I need it to be separated by columns, including the aligned fields, I want to separate by columns:
I'm trying it:
endpoint = f"my_API"
headers = {'Authorization': subscription_key}
request = requests.get(endpoint, headers=headers)
aall_df = json.loads(request.content)
rdd = sc.parallelize([aall_df])
df1 = spark.read.json(rdd)
df1.show(truncate=False)
Expected output:
title | nickname | language | folder_id | category | question_count | page_count | date_created | date_modified | id | next_button | prev_button | ... |
---|---|---|---|---|---|---|---|---|---|---|---|---|
texto | (app) | en | 0 | nan | 3 | 3 | 2021-01-27... | 2021-01-27... | 0 | next | prev | ... |
Can someone help me, please?
Upvotes: 1
Views: 1260
Reputation: 10035
Pass the string/unparsed json to the rdd instead of the parsed json
aall_df = request.content # json.loads(request.content)
rdd = sc.parallelize([aall_df])
df1 = spark.read.json(rdd)
df1.show(truncate = False)
prints:
+-----------+--------------------+--------+-----------+-------------------+-------------------+-----------+---------+------+-----------+-----+--------+--------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------+--------------+-----------+-----+
|analyze_url|buttons_text |category|collect_url|date_created |date_modified |edit_url |folder_id|footer|href |id |is_owner|language|nickname|page_count|pages |preview |question_count|response_count|summary_url|title|
+-----------+--------------------+--------+-----------+-------------------+-------------------+-----------+---------+------+-----------+-----+--------+--------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------+--------------+-----------+-----+
|https://api|{done, , next, prev}| |https://api|2021-01-27T19:22:00|2021-06-01T11:43:00|https://api|0 |false |https://api|00000|true |en |(app) |3 |[{, https://api, 00000, 1, 1, [{{[{, 00000, false, 1, texto, true, 0}, {, 00000, false, 2, texto, true, 0}, {, 00000, false, 3, texto, true, 0}, {, 00000, false, 4, texto, true, 0}, {, 00000, false, 5, texto, true, 0}], [{00000, 1, , true}]}, {{#f5a623, []}, star, emoji, , null, , null, , null, true}, matrix, false, [{texto}], https://api, 602406071, null, 1, {0, texto, all}, null, rating, null, true}], }]|https://api|3 |2636 |https://api|texto|
+-----------+--------------------+--------+-----------+-------------------+-------------------+-----------+---------+------+-----------+-----+--------+--------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------+--------------+-----------+-----+
and
df1.printSchema()
prints
root
|-- analyze_url: string (nullable = true)
|-- buttons_text: struct (nullable = true)
| |-- done_button: string (nullable = true)
| |-- exit_button: string (nullable = true)
| |-- next_button: string (nullable = true)
| |-- prev_button: string (nullable = true)
|-- category: string (nullable = true)
|-- collect_url: string (nullable = true)
|-- date_created: string (nullable = true)
|-- date_modified: string (nullable = true)
|-- edit_url: string (nullable = true)
|-- folder_id: string (nullable = true)
|-- footer: boolean (nullable = true)
|-- href: string (nullable = true)
|-- id: string (nullable = true)
|-- is_owner: boolean (nullable = true)
|-- language: string (nullable = true)
|-- nickname: string (nullable = true)
|-- page_count: long (nullable = true)
|-- pages: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- description: string (nullable = true)
| | |-- href: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- position: long (nullable = true)
| | |-- question_count: long (nullable = true)
| | |-- questions: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- answers: struct (nullable = true)
| | | | | |-- choices: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- description: string (nullable = true)
| | | | | | | |-- id: string (nullable = true)
| | | | | | | |-- is_na: boolean (nullable = true)
| | | | | | | |-- position: long (nullable = true)
| | | | | | | |-- text: string (nullable = true)
| | | | | | | |-- visible: boolean (nullable = true)
| | | | | | | |-- weight: long (nullable = true)
| | | | | |-- rows: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- id: string (nullable = true)
| | | | | | | |-- position: long (nullable = true)
| | | | | | | |-- text: string (nullable = true)
| | | | | | | |-- visible: boolean (nullable = true)
| | | | |-- display_options: struct (nullable = true)
| | | | | |-- custom_options: struct (nullable = true)
| | | | | | |-- color: string (nullable = true)
| | | | | | |-- option_set: array (nullable = true)
| | | | | | | |-- element: string (containsNull = true)
| | | | | |-- display_subtype: string (nullable = true)
| | | | | |-- display_type: string (nullable = true)
| | | | | |-- left_label: string (nullable = true)
| | | | | |-- left_label_id: string (nullable = true)
| | | | | |-- middle_label: string (nullable = true)
| | | | | |-- middle_label_id: string (nullable = true)
| | | | | |-- right_label: string (nullable = true)
| | | | | |-- right_label_id: string (nullable = true)
| | | | | |-- show_display_number: boolean (nullable = true)
| | | | |-- family: string (nullable = true)
| | | | |-- forced_ranking: boolean (nullable = true)
| | | | |-- headings: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- heading: string (nullable = true)
| | | | |-- href: string (nullable = true)
| | | | |-- id: string (nullable = true)
| | | | |-- layout: string (nullable = true)
| | | | |-- position: long (nullable = true)
| | | | |-- required: struct (nullable = true)
| | | | | |-- amount: string (nullable = true)
| | | | | |-- text: string (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | |-- sorting: string (nullable = true)
| | | | |-- subtype: string (nullable = true)
| | | | |-- validation: string (nullable = true)
| | | | |-- visible: boolean (nullable = true)
| | |-- title: string (nullable = true)
|-- preview: string (nullable = true)
|-- question_count: long (nullable = true)
|-- response_count: long (nullable = true)
|-- summary_url: string (nullable = true)
|-- title: string (nullable = true)
Upvotes: 1