Reputation: 169
I have a table of events generated by devices, with this structure:
class Events(db.Model):
id = db.Column(db.Integer, primary_key=True, autoincrement=True)
timestamp_event = db.Column(db.DateTime, nullable=False, index=True)
device_id = db.Column(db.Integer, db.ForeignKey('devices.id'), nullable=True)
which I have to query joined to:
class Devices(db.Model):
id = db.Column(db.Integer, primary_key=True, autoincrement=True)
dev_name = db.Column(db.String(50))
so I can retrieve Device data for every Event.
I´m doing a ranking of the 20 top max events generated in a single hour. It already works, but as my Events table grows (over 1M rows now) the query gets slower and slower. This is my code. Any ideas on how to optimize the query? Maybe a composite index device.id + timestamp_event? Would that work even if searching for a part of the timedate column?
pkd = db.session.query(db.func.count(Events.id),
db.func.date_format(Events.timestamp_event,'%d/%m %H'),\
Devices.dev_name).select_from(Events).join(Devices)\
.filter(Events.timestamp_event >= (datetime.now() - timedelta(days=peak_days)))\
.group_by(db.func.date_format(Events.timestamp_event,'%Y%M%D%H'))\
.group_by(Events.device_id)\
.order_by(db.func.count(Events.id).desc()).limit(20).all()
Here´s sample output of first 3 rows of the query: Number of events, when (DD/MM HH), and which device:
[(2710, '15/01 16', 'Device 002'),
(2612, '11/01 17', 'Device 033'),
(2133, '13/01 15', 'Device 002'),...]
and here´s SQL generated by SQLAlchemy:
SELECT count(events.id) AS count_1,
date_format(events.timestamp_event,
%(date_format_2)s) AS date_format_1,
devices.id AS devices_id,
devices.dev_name AS devices_dev_name
FROM events
INNER JOIN devices ON devices.id = events.device_id
WHERE events.timestamp_event >= %(timestamp_event_1)s
GROUP BY date_format(events.timestamp_event, %(date_format_3)s), events.device_id
ORDER BY count(events.id) DESC
LIMIT %(param_1)s
Upvotes: 0
Views: 329
Reputation: 9059
# This example is for postgresql.
# I'm not sure what db you are using but the date formatting
# is different.
with Session(engine) as session:
# Use subquery to select top 20 event creating device ids
# for each hour since the beginning of the peak.
hour_fmt = "dd/Mon HH24"
hour_col = func.to_char(Event.created_on, hour_fmt).label('event_hour')
event_count_col = func.count(Event.id).label('event_count')
sub_q = select(
event_count_col,
hour_col,
Event.device_id
).filter(
Event.created_on > get_start_of_peak()
).group_by(
hour_col, Event.device_id
).order_by(
event_count_col.desc()
).limit(
20
).alias()
# Now join in the devices to the top ids to get the names.
results = session.execute(
select(
sub_q.c.event_count,
sub_q.c.event_hour,
Device.name
).join_from(
sub_q,
Device,
sub_q.c.device_id == Device.id
).order_by(
sub_q.c.event_count.desc(),
Device.name
)
).all()
Upvotes: 1