Rami Dridi
Rami Dridi

Reputation: 351

Clickhouse materialized view skip some data

I have this table ,that stors the seconds watched of a video, I should then calculate how many times each second is watched

Record Table

CREATE TABLE Record
(

    `id` UUID,

    `pageUrl` LowCardinality(String),

    `mediaSrc` LowCardinality(String),

    `value` Int32 CODEC(DoubleDelta,
 LZ4),

    `createdAt` DateTime64(3) DEFAULT now(),

    `eventType` LowCardinality(String),

    `duration` Int16
)
ENGINE = MergeTree
ORDER BY id;

this is the view

CREATE MATERIALIZED VIEW MediaPlaytime
(

    `countTimes` UInt64,

    `value` Int32,

    `pageUrl` LowCardinality(String),

    `mediaSrc` LowCardinality(String),

    `dateOrderDay` Date
)
ENGINE = SummingMergeTree
ORDER BY value AS
SELECT
    count(value) AS countTimes,

    value,

    pageUrl,

    mediaSrc,

    toDate(createdAt) AS dateOrderDay
FROM Record AS r
WHERE eventType LIKE '%Media Playtime%'
GROUP BY
    dateOrderDay,

    value,

    pageUrl,

    mediaSrc;

I'm using dataset in this format

less records_1.csv

id,mediaSrc,value,eventType,pageUrl,duration,createdAt
6f2f7766-5697-4987-956f-d8bd26330a00,/video/test?idOfVideo=100,0,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
9544b1c3-5af8-42c4-bc15-26725f42fb81,/video/test?idOfVideo=100,3,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
ec2205af-ef6f-49aa-908d-fc6bf9934ddf,/video/test?idOfVideo=100,6,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
ab2b8aea-f9db-4ca2-9a94-bccde0b4f0b9,/video/test?idOfVideo=100,9,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
4dd7588c-60da-42c2-addd-b1e487fca58b,/video/test?idOfVideo=100,12,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
31ae4260-6c72-4d13-8997-a349ad155ebd,/video/test?idOfVideo=100,15,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
1e1875a8-8581-4ba3-97e9-36b66f0bbd6a,/video/test?idOfVideo=100,18,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
5418e1d4-b34b-4dc5-8827-73188dd24d8d,/video/test?idOfVideo=100,21,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
f13b4cbd-fdc0-4cdd-a1e4-2836c5e120fd,/video/test?idOfVideo=100,24,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
b4c0aa58-74cb-4d47-ac8c-b3506704b689,/video/test?idOfVideo=100,27,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
9fd35ae1-8ebe-4ca7-b14b-968fd54307e3,/video/test?idOfVideo=100,30,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
4321afd7-7534-49fe-adbb-4212fa434767,/video/test?idOfVideo=100,33,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
6013cce8-3bd2-4539-8436-3fe4d8c194aa,/video/test?idOfVideo=100,36,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
8f3d8265-da97-4811-9ee0-3e683e90b034,/video/test?idOfVideo=100,39,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000
0f854806-8255-469e-99bf-af627694b5eb,/video/test?idOfVideo=100,42,Media Playtime,/test/pageurl/1 ,1079,2018-11-15 13:00:00.000

when I query the table to get playtime of certain 'mediaSrc' I use this query :

SELECT 
 count(  value) as countTimes,
 value,
 mediaSrc,
 pageUrl,
    toDate(createdAt) as dateOrderDay 
 FROM test.Record r 
WHERE eventType like '%Media Playtime%'  and 
mediaSrc LIKE '%/video/test?idOfVideo=200%' 
GROUP BY dateOrderDay,pageUrl,mediaSrc,value 

query preview

but when I query the view, I should get the same results (theoretically)

SELECT * from MediaPlaytime where   mediaSrc LIKE '%/video/test?idOfVideo=200%'

enter image description here

some data are not calculated. anything I4m doing wrong?

this is the script I4m using to generate data

const fs = require('fs')


function getRandomInt(min, max) {
    min = Math.ceil(min)
    max = Math.floor(max)
    return Math.floor(Math.random() * (max - min) + min) // The maximum is exclusive and the minimum is inclusive
}

const uuid = require('uuid').v4

const started = Date.now()

function close(stream) {
    stream.close()
    return new Promise((resolve, reject) => {
        stream.once('close', () => {
            resolve()
        })
    })
}
const w = async (index) => {
    const writeUsers = fs.createWriteStream(`records_${index}.csv`)
    writeUsers.write('id,mediaSrc,value,eventType,pageUrl,duration,createdAt\r\n', 'utf8')
    for (let v = index * 100; v < index * 100 + 10; v++) { // 10 videos
        const randomTimeline = getRandomInt(1000, 1800)
        for (let u = 1; u < getRandomInt(3000, 8000); u++) { // between 3000 and 8000 users
            const customEnd = getRandomInt(500, randomTimeline)
            const createdAt = new Date(2018, getRandomInt(1, 12), getRandomInt(1, 28), getRandomInt(1, 24), 0, 0)
                .toISOString()
                .split('T')
                .join(' ')
                .slice(0, -1)
            for (let t = 0; t < customEnd - customEnd % 3; t += 3) { // we should random time read
                const preData = {
                    id: uuid(),
                    mediaSrc: `/video/test?idOfVideo=${v}`,
                    value: t,
                    eventType: 'Media Playtime',
                    pageUrl: `/test/pageurl/${index} `,
                    duration: randomTimeline,
                    createdAt: createdAt
                }
                const data = `${preData.id},${preData.mediaSrc},${preData.value},${preData.eventType},${preData.pageUrl},${preData.duration},${preData.createdAt}\r\n`

                writeUsers.write(data, 'UTF-8')
            }
        }
    }

    writeUsers.end()
    await close(writeUsers)
    return true
}
async function write() {
    for (let index = 1; index < 100; index++) { // 100 page
        const started = Date.now()
        const ok = await w(index)
        if (ok) {
            console.log(`finished  link ${index}`)
            console.log(`consumed time is `, Date.now() - started)
        } else {
            console.log(ok)
        }
    }
}


write()

Upvotes: 0

Views: 552

Answers (1)

Denny Crane
Denny Crane

Reputation: 13350

ENGINE = SummingMergeTree ORDER BY value

This ORDER BY is incorrect. ORDER BY of SummingMergeTree should match GROUPBY of SELECT of Mat.View GROUP BY dateOrderDay, value, pageUrl, mediaSrc

In some sense ORDERBY = dimensions: dateOrderDay, value, pageUrl, mediaSrc

All other columns = measurements: countTimes

https://den-crane.github.io/Everything_you_should_know_about_materialized_views_commented.pdf

Change to SummingMergeTree ORDER BY (dateOrderDay, value, pageUrl, mediaSrc)

CREATE MATERIALIZED VIEW MediaPlaytime
(
    `countTimes` UInt64,
    `value` Int32,
    `pageUrl` LowCardinality(String),
    `mediaSrc` LowCardinality(String),
    `dateOrderDay` Date
)
ENGINE = SummingMergeTree ORDER BY (dateOrderDay, value,  pageUrl, mediaSrc)
AS
SELECT
    count(value) AS countTimes,

    value,

    pageUrl,

    mediaSrc,

    toDate(createdAt) AS dateOrderDay
FROM Record AS r
WHERE eventType LIKE '%Media Playtime%'
GROUP BY
    dateOrderDay,

    value,

    pageUrl,

    mediaSrc;

Upvotes: 1

Related Questions