Reputation: 11
I want to create a VIEW in my PostgreSQL database. Therefore I try to translate some R code to PostgreSQL code, as I am not firm in the latter one. To translate the code I try to use show_query() from the dbplyr package.
Here the code I try to run at the moment:
tbl(con, sql(
"SELECT DISTINCT
sp.natural_point_nr,
sp.geom,
s.sample_id,
s.sample_ts,
animal_species.animal_species_code,
r.record,
r.record_type,
r.from_season,
r.sex,
r.age,
r.amount
FROM
db_test.project AS p
INNER JOIN
db_test.natural_point AS np ON p.project_id = np.project_id
INNER JOIN
db_test.sampling_point AS sp ON np.sampling_point_id = sp.sampling_point_id
INNER JOIN
db_test.sample AS s ON sp.sampling_point_id = s.sampling_point_id
INNER JOIN
db_test.record AS r ON s.sample_id = r.sample_id
INNER JOIN
db_test.animal_species AS animal_species ON r.animal_species = animal_species.animal_species_id")) |>
# Transform the data
# Pivot longer to gather the relevant columns
pivot_longer(cols = c(record, record_type, from_season, sex, age, amount),
names_to = "attribute",
values_to = "value") %>%
# Create a new column that combines animal_species and attribute
unite("animal_species_attribute", animal_species, attribute, sep = "_") %>%
# Pivot wider to create the desired format
pivot_wider(names_from = "animal_species_attribute", values_from = "value") %>%
# Group by sample_id and sample_ts to ensure we have a single row
group_by(sample_id, sample_ts) %>%
summarise(across(everything(), ~ first(na.omit(.))), .groups = 'drop') %>%
# Optionally, arrange the columns for better readability
arrange(sample_id) |>
show_query()
Here a reproducible example of the table I call with tbl() in the code above:
# Load necessary libraries
library(tibble)
library(lubridate)
# Set seed for reproducibility
set.seed(123)
# Create the tibble
example_tibble <- tibble(
natural_point_nr = sample(400:700, 226, replace = TRUE),
geom = paste0("0101000020E8640000", sample(1:100000000, 226, replace = TRUE)),
sample_id = 1:226,
sample_ts = as.POSIXct("2024-08-14") + days(sample(0:30, 226, replace = TRUE)),
animal_species_code = sample(1:3, 226, replace = TRUE),
record = sample(c(TRUE, FALSE), 226, replace = TRUE),
record_type = sample(c(NA, 1, 3), 226, replace = TRUE),
from_season = sample(c(NA, 1:4), 226, replace = TRUE),
sex = sample(c(NA, 1:3), 226, replace = TRUE),
age = sample(c(NA, 1:4), 226, replace = TRUE),
amount = sample(c(NA, 1:10), 226, replace = TRUE)
)
In the end I run into a number of errors depending on how I try to adjust my code.
Errors I encounter are about:
Edit: Solution for this error (in answer to Simon.S.A. solution below):
mutate(record = case_when(record == TRUE ~ 1,
record == FALSE ~ 0,
.default = NA)
)
first()
:
! first()
is only available in a windowed (mutate()
) context-...
Upvotes: 0
Views: 44
Reputation: 3535
Would filter(!if_all(everything(), is.na))
achieve what you are looking for summarise(across(everything(), ~ first(na.omit(.))))
to achieve?
# Load necessary libraries
library(tidyverse)
library(DBI)
library(dbplyr)
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
# Set seed for reproducibility
set.seed(123)
# Create the tibble
example_tibble <- tibble(
natural_point_nr = sample(400:700, 226, replace = TRUE),
geom = paste0("0101000020E8640000", sample(1:100000000, 226, replace = TRUE)),
sample_id = 1:226,
sample_ts = as.POSIXct("2024-08-14") + days(sample(0:30, 226, replace = TRUE)),
animal_species_code = sample(1:3, 226, replace = TRUE),
record = sample(c(TRUE, FALSE), 226, replace = TRUE),
record_type = sample(c(NA, 1, 3), 226, replace = TRUE),
from_season = sample(c(NA, 1:4), 226, replace = TRUE),
sex = sample(c(NA, 1:3), 226, replace = TRUE),
age = sample(c(NA, 1:4), 226, replace = TRUE),
amount = sample(c(NA, 1:10), 226, replace = TRUE)
)
db <- dbConnect(duckdb::duckdb())
df <- copy_to(db, example_tibble)
res <-
df |>
# Pivot longer to gather the relevant columns
pivot_longer(cols = c(record, record_type, from_season, sex, age, amount),
names_to = "attribute",
values_to = "value") |>
# Create a new column that combines animal_species and attribute
mutate(animal_species_attribute = str_c(animal_species_code,
attribute, sep = "_")) |>
# Pivot wider to create the desired format
pivot_wider(names_from = "animal_species_attribute",
values_from = "value") |>
# Group by sample_id and sample_ts to ensure we have a single row
group_by(sample_id, sample_ts) |>
filter(!if_all(everything(), is.na)) |>
# Optionally, arrange the columns for better readability
ungroup() |>
arrange(sample_id)
res
#> # Source: SQL [?? x 24]
#> # Database: DuckDB v1.1.3 [root@Darwin 24.3.0:R 4.4.2/:memory:]
#> # Ordered by: sample_id
#> natural_point_nr geom sample_id sample_ts animal_species_code
#> <int> <chr> <int> <dttm> <int>
#> 1 578 010100002… 1 2024-08-22 04:00:00 2
#> 2 578 010100002… 1 2024-08-22 04:00:00 2
#> 3 578 010100002… 1 2024-08-22 04:00:00 2
#> 4 578 010100002… 1 2024-08-22 04:00:00 2
#> 5 578 010100002… 1 2024-08-22 04:00:00 2
#> 6 578 010100002… 1 2024-08-22 04:00:00 2
#> 7 413 010100002… 2 2024-08-15 04:00:00 1
#> 8 413 010100002… 2 2024-08-15 04:00:00 1
#> 9 413 010100002… 2 2024-08-15 04:00:00 1
#> 10 413 010100002… 2 2024-08-15 04:00:00 1
#> # ℹ more rows
#> # ℹ 19 more variables: attribute <chr>, `2_amount` <dbl>,
#> # `1_record_type` <dbl>, `3_amount` <dbl>, `1_sex` <dbl>,
#> # `3_record_type` <dbl>, `1_amount` <dbl>, `2_record` <dbl>,
#> # `3_record` <dbl>, `2_record_type` <dbl>, `1_record` <dbl>,
#> # `3_from_season` <dbl>, `2_sex` <dbl>, `3_age` <dbl>, `3_sex` <dbl>,
#> # `1_from_season` <dbl>, `2_age` <dbl>, `1_age` <dbl>, …
res |>
show_query()
#> <SQL>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> "attribute",
#> MAX(CASE WHEN (animal_species_attribute = '2_amount') THEN "value" END) AS "2_amount",
#> MAX(CASE WHEN (animal_species_attribute = '1_record_type') THEN "value" END) AS "1_record_type",
#> MAX(CASE WHEN (animal_species_attribute = '3_amount') THEN "value" END) AS "3_amount",
#> MAX(CASE WHEN (animal_species_attribute = '1_sex') THEN "value" END) AS "1_sex",
#> MAX(CASE WHEN (animal_species_attribute = '3_record_type') THEN "value" END) AS "3_record_type",
#> MAX(CASE WHEN (animal_species_attribute = '1_amount') THEN "value" END) AS "1_amount",
#> MAX(CASE WHEN (animal_species_attribute = '2_record') THEN "value" END) AS "2_record",
#> MAX(CASE WHEN (animal_species_attribute = '3_record') THEN "value" END) AS "3_record",
#> MAX(CASE WHEN (animal_species_attribute = '2_record_type') THEN "value" END) AS "2_record_type",
#> MAX(CASE WHEN (animal_species_attribute = '1_record') THEN "value" END) AS "1_record",
#> MAX(CASE WHEN (animal_species_attribute = '3_from_season') THEN "value" END) AS "3_from_season",
#> MAX(CASE WHEN (animal_species_attribute = '2_sex') THEN "value" END) AS "2_sex",
#> MAX(CASE WHEN (animal_species_attribute = '3_age') THEN "value" END) AS "3_age",
#> MAX(CASE WHEN (animal_species_attribute = '3_sex') THEN "value" END) AS "3_sex",
#> MAX(CASE WHEN (animal_species_attribute = '1_from_season') THEN "value" END) AS "1_from_season",
#> MAX(CASE WHEN (animal_species_attribute = '2_age') THEN "value" END) AS "2_age",
#> MAX(CASE WHEN (animal_species_attribute = '1_age') THEN "value" END) AS "1_age",
#> MAX(CASE WHEN (animal_species_attribute = '2_from_season') THEN "value" END) AS "2_from_season"
#> FROM (
#> SELECT
#> q01.*,
#> CONCAT_WS('_', animal_species_code, "attribute") AS animal_species_attribute
#> FROM (
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'record' AS "attribute",
#> record AS "value"
#> FROM example_tibble
#>
#> UNION ALL
#>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'record_type' AS "attribute",
#> record_type AS "value"
#> FROM example_tibble
#>
#> UNION ALL
#>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'from_season' AS "attribute",
#> from_season AS "value"
#> FROM example_tibble
#>
#> UNION ALL
#>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'sex' AS "attribute",
#> sex AS "value"
#> FROM example_tibble
#>
#> UNION ALL
#>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'age' AS "attribute",
#> age AS "value"
#> FROM example_tibble
#>
#> UNION ALL
#>
#> SELECT
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> 'amount' AS "attribute",
#> amount AS "value"
#> FROM example_tibble
#> ) q01
#> ) q01
#> GROUP BY
#> natural_point_nr,
#> geom,
#> sample_id,
#> sample_ts,
#> animal_species_code,
#> "attribute"
#> HAVING (NOT(((natural_point_nr IS NULL) AND (geom IS NULL) AND (animal_species_code IS NULL) AND ("attribute" IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_amount') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_record_type') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_amount') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_sex') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_record_type') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_amount') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_record') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_record') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_record_type') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_record') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_from_season') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_sex') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_age') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '3_sex') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_from_season') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_age') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '1_age') THEN "value" END)) IS NULL) AND ((MAX(CASE WHEN (animal_species_attribute = '2_from_season') THEN "value" END)) IS NULL))))
#> ORDER BY sample_id
Created on 2025-02-11 with reprex v2.1.1
Upvotes: 0
Reputation: 6941
This error is not caused by show_query
, show_query
is simply the trigger for checking the validity of the SQL translation which gives you the error.
The problem is that you are trying to combine two different data types. In R, conversion between data types can be automatic. For example TRUE/FALSE can be converted to numeric and so they can be combined with integers or floats. However, in SQL this conversion is not automatic.
The problem is most likely occurring in pivot_longer
.
unite
, pivot_wider
, group_by
, summarise
, and arrange
from your example code and seeing if the error still occurs.example_tibble
you can see that record
is of type logical - which in SQL is probably of type boolean - and that record_type
is of type integer - which in SQL could be stored as data type smallint.One solution is to convert all your columns to a consistent data type first. For example:
example_tibble |>
mutate(record = as.numeric(record))
Or if you need more control you can use something like:
example_tibble |>
mutate(record = sql('CAST(record AS INT)'))
Upvotes: 0