Reputation: 31
I want to import some weather data (temperature, wind speed, ...) that is stored in a JSON file into a PostgreSQL 11 table so I can then run queries on that data.
I've been able to manually insert some data into a table, but that only works because it's a small amount of data, and I'm planning on using a LOT more data afterwards. Here is what I found on using INSERT: https://datavirtuality.com/blog-json-in-postgresql/.
That's why I've been trying to use the COPY command, but no luck so far, even after reading a lot of material from different sources on the Internet ...
The JSON file is downloadable here: https://queueresults.meteoblue.com/F2637B90-45BB-4E7A-B47C-C34CD56674B3 (let me know if the file doesn't exist anymore).
I've been able to import the JSON file as text into a table with:
create table temp_json ("values" text);
copy temp_json from '/home/cae/test.json';
But I don't think that's the best approach to be able to make efficient queries later on ...
I usually run into the following error during my tests:
ERROR: invalid input syntax for type json
DETAIL: The input string ended unexpectedly.
CONTEXT: JSON data, line 1: [
as if I'm not able to parse the JSON file and the array properly within PostgreSQL ...
Thanks for your help!
Edit: Here is the content of the JSON file:
[
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-0.395,
-0.195,
-0.099999994,
-0.030000001,
-0.060000002,
-0.099999994,
-0.099999994,
0.005,
-0.055,
0.19,
0.48,
0.725,
1.88,
1.88,
1.855,
1.935,
2.1950002,
2.595,
3.3049998,
4.115,
3.37,
2.97,
3.32,
3.5149999,
3.56,
3.44,
3.355,
3.3600001,
3.32,
3.32,
3.4250002,
3.42,
3.3899999,
3.445,
3.3200002,
3.0549998,
4.58,
4.01,
3.02,
2.79,
2.75,
2.76,
2.855,
2.99,
2.96,
2.775,
2.595,
2.4250002
],
[
-0.49,
-0.26,
-0.16,
-0.09,
-0.1,
-0.13,
-0.12,
0.01,
-0.07,
0.17,
0.44,
0.66,
1.84,
1.85,
1.83,
1.9,
2.15,
2.55,
3.27,
4.11,
3.46,
2.96,
3.31,
3.5,
3.55,
3.42,
3.33,
3.34,
3.29,
3.29,
3.43,
3.44,
3.42,
3.52,
3.41,
3.11,
4.53,
4,
3.01,
2.79,
2.76,
2.77,
2.87,
3,
2.93,
2.71,
2.53,
2.38
]
],
"gapFillRatio": 0
}
],
"level": "2 m above gnd",
"aggregation": "none",
"code": 11,
"variable": "Temperature"
}
],
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
],
"timeResolution": "hourly"
},
{
"geometry": {
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
],
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"type": "MultiPoint"
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"aggregation": "none",
"code": 11,
"level": "1000 mb",
"dataPerTimeInterval": [
{
"data": [
[
-0.585,
-0.265,
-0.055,
0.04,
0.044999998,
0.08,
0.11,
0.205,
0.13499999,
0.43,
0.84000003,
1.2,
2.1,
2.33,
2.5,
2.72,
3.1750002,
3.775,
4.915,
5.37,
4.16,
3.795,
4.1949997,
4.41,
4.415,
4.275,
4.1800003,
4.16,
4.0950003,
4.08,
4.185,
4.1,
3.98,
3.575,
3.22,
2.92,
4.395,
3.7649999,
2.895,
2.66,
2.6550002,
2.72,
2.845,
2.955,
2.89,
2.685,
2.54,
2.355
],
[
-0.64,
-0.29,
-0.08,
0.01,
0.03,
0.08,
0.12,
0.24,
0.14,
0.4,
0.8,
1.13,
2.11,
2.34,
2.52,
2.74,
3.19,
3.82,
4.91,
5.45,
4.29,
3.81,
4.19,
4.42,
4.43,
4.28,
4.17,
4.15,
4.08,
4.06,
4.18,
4.12,
4.01,
3.66,
3.31,
2.97,
4.38,
3.79,
2.9,
2.68,
2.68,
2.75,
2.89,
2.99,
2.88,
2.64,
2.43,
2.27
]
],
"gapFillRatio": 0
}
],
"variable": "Temperature"
}
],
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
],
"timeResolution": "hourly"
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-7.0950003,
-6.615,
-4.815,
-3.55,
-2.6750002,
-2.1950002,
-2.695,
-2.87,
-2.1399999,
-0.995,
0.1,
1,
0.335,
0.38,
-0.030000001,
-0.8,
-0.18,
0.575,
1.11,
-0.32999998,
-1.03,
-2.31,
-3.09,
-3.7350001,
-3.93,
-3.905,
-3.92,
-3.71,
-3.625,
-3.195,
-3.7,
-3.32,
-3.72,
-3.915,
-3.93,
-3.605,
-4.315,
-3.8899999,
-3.815,
-3.38,
-3.2150002,
-3.27,
-3.435,
-3.47,
-3.43,
-3.37,
-3.44,
-3.51
],
[
-7.11,
-6.73,
-4.94,
-3.57,
-2.7,
-2.15,
-2.62,
-2.91,
-2.22,
-1.1,
0.03,
0.9,
0.36,
0.37,
0.11,
-0.74,
-0.13,
0.59,
1.19,
-0.19,
-0.95,
-2.18,
-3.08,
-3.68,
-3.97,
-3.94,
-3.93,
-3.69,
-3.63,
-3.27,
-3.7,
-3.32,
-3.68,
-3.9,
-3.97,
-3.6,
-4.29,
-3.92,
-3.8,
-3.37,
-3.24,
-3.28,
-3.42,
-3.44,
-3.39,
-3.35,
-3.37,
-3.44
]
],
"gapFillRatio": 0
}
],
"level": "850 mb",
"code": 11,
"aggregation": "none",
"variable": "Temperature"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E31.7m",
"59.4°N/24.8°E36.4m"
],
"coordinates": [
[
24.7,
59.4,
31.73
],
[
24.8,
59.4,
36.445
]
]
},
"domain": "NEMS12",
"codes": [
{
"unit": "°C",
"dataPerTimeInterval": [
{
"data": [
[
-10.84,
-12,
-10.280001,
-8.865,
-8.5,
-7.7,
-7.5699997,
-7.655,
-8.434999,
-8.844999,
-8.700001,
-7.1549997,
-9.555,
-10.004999,
-7.885,
-8.32,
-8.370001,
-8.915,
-9.53,
-10.225,
-10.934999,
-11.12,
-11.434999,
-11.575,
-11.965,
-11.64,
-12.12,
-12.345,
-12.34,
-12.48,
-12.844999,
-13.174999,
-13.18,
-13.219999,
-13.434999,
-13.305,
-12.775,
-12.745,
-12.79,
-12.75,
-12.690001,
-12.77,
-12.77,
-12.76,
-12.67,
-12.605,
-12.635,
-12.695
],
[
-10.74,
-11.94,
-10.54,
-8.77,
-8.56,
-7.75,
-7.52,
-7.53,
-8.24,
-8.95,
-8.77,
-7.15,
-9.48,
-10.03,
-7.88,
-8.24,
-8.35,
-8.82,
-9.4,
-10.08,
-10.84,
-11.04,
-11.3,
-11.5,
-11.9,
-11.6,
-12.09,
-12.31,
-12.39,
-12.48,
-12.83,
-13.16,
-13.2,
-13.19,
-13.4,
-13.3,
-12.77,
-12.7,
-12.78,
-12.71,
-12.66,
-12.73,
-12.73,
-12.72,
-12.62,
-12.57,
-12.6,
-12.67
]
],
"gapFillRatio": 0
}
],
"code": 11,
"level": "700 mb",
"aggregation": "none",
"variable": "Temperature"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
},
{
"geometry": {
"type": "MultiPoint",
"locationNames": [
"59.4°N/24.7°E",
"59.4°N/24.8°E"
],
"coordinates": [
[
24.7,
59.4,
"NaN"
],
[
24.8,
59.4,
"NaN"
]
]
},
"domain": "CAMSGLOBAL",
"codes": [
{
"unit": "",
"dataPerTimeInterval": [
{
"data": [
[
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN"
],
[
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN"
]
],
"gapFillRatio": 0
}
],
"code": 706,
"level": "sfc",
"aggregation": "none",
"variable": "Air Quality Index"
}
],
"timeResolution": "hourly",
"timeIntervals": [
[
"20180101T0000",
"20180101T0100",
"20180101T0200",
"20180101T0300",
"20180101T0400",
"20180101T0500",
"20180101T0600",
"20180101T0700",
"20180101T0800",
"20180101T0900",
"20180101T1000",
"20180101T1100",
"20180101T1200",
"20180101T1300",
"20180101T1400",
"20180101T1500",
"20180101T1600",
"20180101T1700",
"20180101T1800",
"20180101T1900",
"20180101T2000",
"20180101T2100",
"20180101T2200",
"20180101T2300",
"20180102T0000",
"20180102T0100",
"20180102T0200",
"20180102T0300",
"20180102T0400",
"20180102T0500",
"20180102T0600",
"20180102T0700",
"20180102T0800",
"20180102T0900",
"20180102T1000",
"20180102T1100",
"20180102T1200",
"20180102T1300",
"20180102T1400",
"20180102T1500",
"20180102T1600",
"20180102T1700",
"20180102T1800",
"20180102T1900",
"20180102T2000",
"20180102T2100",
"20180102T2200",
"20180102T2300"
]
]
}
]
Upvotes: 3
Views: 17192
Reputation: 11
Thank you to the OP and all previous answers! I am still learning (coming from MongoDB) and wanted to experiment with PG's JSONB querying capabilities. To that end, I downloaded some sample data from GitHub (here's the source if you want it).
The answers above led me to the solution below, but the problems I was having with the data were multi-fold:
- The \copy command breaks when there are line endings within the data it is "copying" (importing). The errors I received were ERROR: invalid input syntax for type json along with The input string ended unexpectedly. To resolve this, use a JSON parsing tool like jq to transform the JSON file before feeding it to \copy ... more on that in the following points.
- I first tried jq with the options jq --stream -nc. From the jq manual: "Parse the input in streaming fashion, outputting arrays of path and leaf values (scalars and empty arrays or empty objects)." --stream did not work; it caused PG to import things in a way that didn't fit my use case - and I think I was using it wrong. "Don't read any input at all! Instead, the filter is run once using null as the input." -n needed to be scrapped because of the root-wrapping array. "By default, jq pretty-prints JSON output. Using this option will result in more compact output by instead putting each JSON object on a single line." (That last one describes -c, which I did keep.)
- The data is a root-level array wrapping JSON objects, each with a handful of keys (id, bio, name, etc.); in my example here, the target is the some_table.some_column column. To be clear, I am looking to work with PG in a similar way to MongoDB, so I needed each JSON object to be added to the table on a one-row-per-JSON-object basis. Other examples in this and other SO threads (when I managed to get them to work) were importing either the full JSON array as TEXT into one row or importing all the JSON objects into a single JSONB value, etc. ... not what I was trying to do.
- Ultimately, using jq in "just the right way" was part of the final solution, which also involved using PG's PROGRAM option from within the PG REPL (psql, the PG shell client).
Sample data:
[
{"id": "V59OF92YF627HFY0", "bio": "Donec lobortis eleifend condimentum. Cras dictum dolor lacinia lectus vehicula rutrum. Maecenas quis nisi nunc. Nam tristique feugiat est vitae mollis. Maecenas quis nisi nunc.", "name": "Adeel Solangi", "version": 6.1, "language": "Sindhi"},
{"id": "ENTOCR13RSCLZ6KU", "bio": "Aliquam sollicitudin ante ligula, eget malesuada nibh efficitur et. Pellentesque massa sem, scelerisque sit amet odio id, cursus tempor urna. Etiam congue dignissim volutpat. Vestibulum pharetra libero et velit gravida euismod.", "name": "Afzal Ghaffar", "version": 1.88, "language": "Sindhi"}
]
$ psql -U some_user some_database

- You MUST log in to the PG REPL because the \copy command doesn't work outside the REPL (e.g. from a DB GUI like IntelliJ or VSCode's DB tools) unless you specify special permissions for the user.

psql> \copy some_table(some_column) from PROGRAM 'jq -rc ".[]" /home/USERNAME/sample-data/tmp_user_data.json'

- Use the \copy meta-command to load and parse the data into the some_column column of the some_table table, and specify the data source as program execution on the local system: from PROGRAM.
- The jq command breaks down into:
  - -r / --raw-output: With this option, if the filter's result is a string then it will be written directly to standard output rather than being formatted as a JSON string with quotes. I believe this flag was essential because PG (apparently) uses " as an escape character (I read that somewhere, not 100% sure about it though), and this was part of what was breaking other commands from other SO threads that simply import JSON from a file (without jq to transform the data).
  - -c / --compact-output: As mentioned above, compress the output instead of prettifying it - i.e., each JSON object ends up on a single line.
  - .[]: This is the jq query. The . at the beginning signifies the "root" of the file, and the [] says something like "select all array items for output". This is the key to having all nested JSON objects inserted one per row into some_table.some_column - which is what I was looking for.
- Note regarding from PROGRAM in the PG shell command: I'm passing the command in single quotes, and within the single quotes I use double quotes to pass jq the "query" I want it to run, i.e.: ... from PROGRAM 'jq -rc ".[]" ...'

Hope this helps someone out there!
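To round this out, the only pieces not shown above are creating the target table and querying it afterwards. A minimal sketch, assuming some_column is declared as jsonb (the table name, column name and keys are just the placeholders and sample fields already used above):

psql> CREATE TABLE some_table (some_column jsonb);
psql> -- run the \copy ... from PROGRAM 'jq -rc ".[]" ...' line shown above, then:
psql> SELECT some_column->>'name' AS name, some_column->>'language' AS language FROM some_table;

Each object of the root array lands in its own row, so the -> and ->> operators can be used for ordinary per-field queries.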
Upvotes: 1
Reputation: 872
The same idea as @jjanes's answer, but with a real, working command-line tool:
\copy json_table FROM PROGRAM 'jq --stream -nc -f myfile.json';
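The target table is not shown in this answer; it could be as simple as the sketch below (the column name and type here are assumptions, not part of the answer):

CREATE TABLE json_table (data jsonb);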
Upvotes: 1
Reputation: 31
Removing the "pretty" formatting from the file helped with the COPY command, but it puts the whole content of the file into one row, making it impossible to run a simple SELECT query against individual fields ...
Here is what I used:
CREATE TEMP TABLE target(data jsonb);
copy target from '/home/cae/test_without_new_lines.json';
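Even with the whole file in one row, the array in that row can be unnested into one row per top-level object with jsonb_array_elements. A minimal sketch against the target table above (the key names follow the weather JSON in the question):

SELECT elem->>'domain' AS domain,
       elem->'codes'->0->>'variable' AS variable
FROM target,
     jsonb_array_elements(data) AS elem;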
Upvotes: 0
Reputation: 44227
Given your first example, you could then process it like this to split the JSON array into individual objects and stuff them into a table as separate rows:
create table real_json as select value::jsonb from temp_json join lateral json_array_elements("values"::json) on true;
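Once real_json exists, each row holds one object of the original array, so nested pieces can be pulled out with the jsonb operators and set-returning functions. As one illustrative sketch (not part of the original answer, and assuming the column created above keeps the name value), this pairs the hourly values of the first location of the first code with their timestamps:

SELECT t.ts, d.val::numeric AS temperature
FROM real_json r,
     LATERAL jsonb_array_elements_text(r.value->'codes'->0->'dataPerTimeInterval'->0->'data'->0)
       WITH ORDINALITY AS d(val, idx),
     LATERAL jsonb_array_elements_text(r.value->'timeIntervals'->0)
       WITH ORDINALITY AS t(ts, tidx)
WHERE d.idx = t.tidx;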
However, this whole approach depends on the large single JSON object always being small enough to fit comfortably into an amount of memory you are willing to use, which seems like a dubious proposition. You need a library which does incremental or streaming parsing of the JSON object, returning one 2nd-level object at a time and clearing it from memory once returned. I don't think that PostgreSQL provides such a facility. If you let us know what your favorite programming language is, perhaps someone can propose a specific library.
Alternatively, you could whip up a quick-and-dirty script that splits the JSON into separate records, relying on the assumption that the indentation of the "pretty" file is always correct (using "^  [{}]" as markers), and then strips out the newlines to reverse the "pretty" formatting so that each record ends up on a single line. If you had such a script, you could then do:
\copy real_json FROM PROGRAM 'unnest_top_array_and_depretty /home/cae/test_without_new_lines.json';
Upvotes: 1