Optimize cost calculation on time series data in PostgreSQL with TimescaleDB

Question

I'm working on calculating energy costs from time-series data using PostgreSQL 15 and TimescaleDB (TSL licence, so not all open source features available). The cost calculation is based on energy flow and prices over irregular intervals, and I'm not sure if I'm doing it right and how to improve the performance. I expect a lot of data coming in from streaming, but I don't need to calculate the costs continuosly (can be a nightly batch job).

The goal is to accurately calculate the costs using a formula like sum(prices.flow x prices.price_rp x delta_t)

Here is the schema setup:

CREATE TABLE nodes (
  id integer PRIMARY KEY NOT NULL,
  name character varying NOT NULL
);
CREATE INDEX ix_nodes_id ON nodes USING BTREE (id);
CREATE UNIQUE INDEX ix_nodes_name ON nodes USING BTREE (name);


CREATE TABLE prices (
  node_id integer NOT NULL,
  price_rp double precision NOT NULL,
  in_flow double precision NOT NULL,
  out_flow double precision NOT NULL,
  timestamp timestamp with time zone NOT NULL,
  inserted timestamp with time zone NOT NULL,
  PRIMARY KEY (node_id, timestamp),
  FOREIGN KEY (node_id) REFERENCES public.nodes (id)
);
CREATE INDEX prices_timestamp_idx ON prices USING BTREE (timestamp);


CREATE TABLE node_energy (
  node_id integer NOT NULL,
  status varchar NOT NULL,
  value_wh integer NOT NULL,
  timestamp timestamp with time zone NOT NULL,
  inserted timestamp with time zone NOT NULL,
  PRIMARY KEY (node_id, timestamp),
  FOREIGN KEY (node_id) REFERENCES public.nodes (id)
);
CREATE INDEX node_energy_timestamp_idx ON node_energy USING BTREE (timestamp);

I enabled hypertables:

SELECT create_hypertable('node_energy', 'timestamp', if_not_exists => TRUE);
SELECT create_hypertable('prices', 'timestamp', if_not_exists => TRUE);

Here are sample data filled in:

-- populate nodes
INSERT INTO nodes (id, name) VALUES
  (1, 'Node 1'),
  (2, 'Node 2'),
  (3, 'Node 3'),
  (4, 'Node 4'),
  (5, 'Node 5'),
  (6, 'Node 6'),
  (7, 'Node 7'),
  (8, 'Node 8'),
  (9, 'Node 9'),
  (10, 'Node 10');

--- insert 10 x 1000 energy points
INSERT INTO node_energy (node_id, status, value_wh, timestamp, inserted)
SELECT
  node_id,
  CASE WHEN random() > 0.5 THEN 'SUPPLYING' ELSE 'DEMANDING' END AS status,
  trunc(random() * 1000) AS value_wh,
  timestamp '2024-01-01 00:00:00' + (floor((s - 1) / 30) * interval '1 day') + ((s - 1) % 30) * interval '10 minutes' AS timestamp,
  now() AS inserted
FROM
  generate_series(1, 10) AS node_id,
  generate_series(1, 1000) AS s;

--- insert 10 x 1000 prices
INSERT INTO prices (node_id, price_rp, in_flow, out_flow, timestamp, inserted)
SELECT
  node_id,
  random() * 100 + 10 AS price_rp,  -- Random price between 10 and 150
  random() * 10 AS in_flow,         -- Random in-flow rate
  random() * 10 AS out_flow,        -- Random out-flow rate
  timestamp '2024-01-01 00:00:00' + (floor((s - 1) / 30) * interval '1 day') + ((s - 1) % 30) * interval '10 minutes' AS timestamp,
  now() AS inserted
FROM
  generate_series(1, 10) AS node_id,
  generate_series(1, 1000) AS s;

And here is my drafted query:

WITH cost_data AS (
  
  SELECT
    ne.node_id,
    ne.timestamp AS energy_time,
    p.price_rp,
    p.timestamp AS price_time,
    (ne.value_wh / 1000.0) AS value_kwh, -- converting Wh to kWh
    EXTRACT(EPOCH FROM (ne.timestamp - COALESCE(LAG(ne.timestamp) OVER (PARTITION BY ne.node_id ORDER BY ne.timestamp), ne.timestamp))) AS delta_t,
    CASE WHEN ne.status = 'SUPPLYING' THEN p.out_flow
         WHEN ne.status = 'DEMANDING' THEN p.in_flow
         ELSE 0 END AS flow
FROM
    public.node_energy ne
LEFT JOIN LATERAL
    (SELECT price_rp, timestamp, in_flow, out_flow
     FROM public.prices p
     WHERE p.node_id = ne.node_id AND p.timestamp <= ne.timestamp
     ORDER BY p.timestamp DESC
     LIMIT 1) p ON true
ORDER BY
    ne.timestamp
  
)
SELECT
    node_id,
    SUM(flow * price_rp * delta_t / 3600000) AS total_cost_rp -- converting seconds to hours and applying price rate
FROM
    cost_data
GROUP BY
    node_id
ORDER BY node_id;

DBFiddle

The query so far does, I believe at least, what it should, however it does not scale well. How can I verify that the costs are calculated correctly, and how can I improve the performance? I would like to avoid introducing stream processing if possible, so that's why I'm looking at a SQL solution.

Optimize cost calculation on time series data in PostgreSQL with TimescaleDB

Answers (0)

Related Questions