Reputation: 351
I have this table ,that stors the seconds watched of a video, I should then calculate how many times each second is watched
CREATE TABLE Record
(
`id` UUID,
`pageUrl` LowCardinality(String),
`mediaSrc` LowCardinality(String),
`value` Int32 CODEC(DoubleDelta,
LZ4),
`createdAt` DateTime64(3) DEFAULT now(),
`eventType` LowCardinality(String),
`duration` Int16
)
ENGINE = MergeTree
ORDER BY id;
this is the view
CREATE MATERIALIZED VIEW MediaPlaytime
(
`countTimes` UInt64,
`value` Int32,
`pageUrl` LowCardinality(String),
`mediaSrc` LowCardinality(String),
`dateOrderDay` Date
)
ENGINE = SummingMergeTree
ORDER BY value AS
SELECT
count(value) AS countTimes,
value,
pageUrl,
mediaSrc,
toDate(createdAt) AS dateOrderDay
FROM Record AS r
WHERE eventType LIKE '%Media Playtime%'
GROUP BY
dateOrderDay,
value,
pageUrl,
mediaSrc;
I'm using dataset in this format
less records_1.csv
id,mediaSrc,value,eventType,pageUrl,duration,createdAt
6f2f7766-5697-4987-956f-d8bd26330a00,/video/test?idOfVideo=100,0,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
9544b1c3-5af8-42c4-bc15-26725f42fb81,/video/test?idOfVideo=100,3,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
ec2205af-ef6f-49aa-908d-fc6bf9934ddf,/video/test?idOfVideo=100,6,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
ab2b8aea-f9db-4ca2-9a94-bccde0b4f0b9,/video/test?idOfVideo=100,9,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
4dd7588c-60da-42c2-addd-b1e487fca58b,/video/test?idOfVideo=100,12,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
31ae4260-6c72-4d13-8997-a349ad155ebd,/video/test?idOfVideo=100,15,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
1e1875a8-8581-4ba3-97e9-36b66f0bbd6a,/video/test?idOfVideo=100,18,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
5418e1d4-b34b-4dc5-8827-73188dd24d8d,/video/test?idOfVideo=100,21,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
f13b4cbd-fdc0-4cdd-a1e4-2836c5e120fd,/video/test?idOfVideo=100,24,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
b4c0aa58-74cb-4d47-ac8c-b3506704b689,/video/test?idOfVideo=100,27,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
9fd35ae1-8ebe-4ca7-b14b-968fd54307e3,/video/test?idOfVideo=100,30,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
4321afd7-7534-49fe-adbb-4212fa434767,/video/test?idOfVideo=100,33,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
6013cce8-3bd2-4539-8436-3fe4d8c194aa,/video/test?idOfVideo=100,36,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
8f3d8265-da97-4811-9ee0-3e683e90b034,/video/test?idOfVideo=100,39,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
0f854806-8255-469e-99bf-af627694b5eb,/video/test?idOfVideo=100,42,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
when I query the table to get playtime of certain 'mediaSrc' I use this query :
SELECT
count( value) as countTimes,
value,
mediaSrc,
pageUrl,
toDate(createdAt) as dateOrderDay
FROM test.Record r
WHERE eventType like '%Media Playtime%' and
mediaSrc LIKE '%/video/test?idOfVideo=200%'
GROUP BY dateOrderDay,pageUrl,mediaSrc,value
but when I query the view, I should get the same results (theoretically)
SELECT * from MediaPlaytime where mediaSrc LIKE '%/video/test?idOfVideo=200%'
some data are not calculated. anything I4m doing wrong?
this is the script I4m using to generate data
const fs = require('fs')
function getRandomInt(min, max) {
min = Math.ceil(min)
max = Math.floor(max)
return Math.floor(Math.random() * (max - min) + min) // The maximum is exclusive and the minimum is inclusive
}
const uuid = require('uuid').v4
const started = Date.now()
function close(stream) {
stream.close()
return new Promise((resolve, reject) => {
stream.once('close', () => {
resolve()
})
})
}
const w = async (index) => {
const writeUsers = fs.createWriteStream(`records_${index}.csv`)
writeUsers.write('id,mediaSrc,value,eventType,pageUrl,duration,createdAt\r\n', 'utf8')
for (let v = index * 100; v < index * 100 + 10; v++) { // 10 videos
const randomTimeline = getRandomInt(1000, 1800)
for (let u = 1; u < getRandomInt(3000, 8000); u++) { // between 3000 and 8000 users
const customEnd = getRandomInt(500, randomTimeline)
const createdAt = new Date(2018, getRandomInt(1, 12), getRandomInt(1, 28), getRandomInt(1, 24), 0, 0)
.toISOString()
.split('T')
.join(' ')
.slice(0, -1)
for (let t = 0; t < customEnd - customEnd % 3; t += 3) { // we should random time read
const preData = {
id: uuid(),
mediaSrc: `/video/test?idOfVideo=${v}`,
value: t,
eventType: 'Media Playtime',
pageUrl: `/test/pageurl/${index} `,
duration: randomTimeline,
createdAt: createdAt
}
const data = `${preData.id},${preData.mediaSrc},${preData.value},${preData.eventType},${preData.pageUrl},${preData.duration},${preData.createdAt}\r\n`
writeUsers.write(data, 'UTF-8')
}
}
}
writeUsers.end()
await close(writeUsers)
return true
}
async function write() {
for (let index = 1; index < 100; index++) { // 100 page
const started = Date.now()
const ok = await w(index)
if (ok) {
console.log(`finished link ${index}`)
console.log(`consumed time is `, Date.now() - started)
} else {
console.log(ok)
}
}
}
write()
Upvotes: 0
Views: 552
Reputation: 13350
ENGINE = SummingMergeTree ORDER BY value
This ORDER BY
is incorrect. ORDER BY
of SummingMergeTree should match GROUPBY of SELECT of Mat.View GROUP BY dateOrderDay, value, pageUrl, mediaSrc
In some sense ORDERBY = dimensions: dateOrderDay, value, pageUrl, mediaSrc
All other columns = measurements: countTimes
https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf
Change to SummingMergeTree ORDER BY (dateOrderDay, value, pageUrl, mediaSrc)
CREATE MATERIALIZED VIEW MediaPlaytime
(
`countTimes` UInt64,
`value` Int32,
`pageUrl` LowCardinality(String),
`mediaSrc` LowCardinality(String),
`dateOrderDay` Date
)
ENGINE = SummingMergeTree ORDER BY (dateOrderDay, value, pageUrl, mediaSrc)
AS
SELECT
count(value) AS countTimes,
value,
pageUrl,
mediaSrc,
toDate(createdAt) AS dateOrderDay
FROM Record AS r
WHERE eventType LIKE '%Media Playtime%'
GROUP BY
dateOrderDay,
value,
pageUrl,
mediaSrc;
Upvotes: 1