Select 30% of each column value

Question

Let's assume we have a table with a column 'A' that has values from 0 to N. And I want to select 30% each rows that have the same value for the column 'A'.

So if I have this:
A|  B
-------
0 hello
0 test
0 hi
1 blah1
1 blah2
1 blah3
1 blah4
1 blah5
1 blah6

Result:
A|  B
-------
0 hello
1 blah1
1 blah4

it could be blah1 or any other blah that is not blah4, and blah4 can be any other blah that is not blah1, basically it could be random or skipping.

By the way, the actual table is huge, talking terabytes, so think about performance.

KM. · Accepted Answer

try something like this:

DECLARE @YourTable table (A int, b varchar(10))
INSERT @YourTable VALUES (0, 'hello') --OP's data
INSERT @YourTable VALUES (0, 'test')
INSERT @YourTable VALUES (0, 'hi')
INSERT @YourTable VALUES (1, 'blah1')
INSERT @YourTable VALUES (1, 'blah2')
INSERT @YourTable VALUES (1, 'blah3')
INSERT @YourTable VALUES (1, 'blah4')
INSERT @YourTable VALUES (1, 'blah5')
INSERT @YourTable VALUES (1, 'blah6')

;WITH NumberedRows AS
(   SELECT 
        A,B,ROW_NUMBER() OVER (PARTITION BY A ORDER BY A,B) AS RowNumber
        FROM @YourTable
)
, GroupCounts AS
(   SELECT
        A,MAX(RowNumber) AS MaxA
        FROM NumberedRows
        GROUP BY A
)
SELECT
    n.a,n.b
    FROM NumberedRows           n
        INNER JOIN GroupCounts  c ON n.A=c.A
    WHERE n.RowNUmber<=(c.MaxA+1)*0.3

OUTPUT:

a           b
----------- ----------
0           hello
1           blah1
1           blah2

(3 row(s) affected)

EDIT based on the great idea in the comment from Andriy M

;WITH NumberedRows AS
(   SELECT 
        A,B,ROW_NUMBER() OVER (PARTITION BY A ORDER BY A,B) AS RowNumber
            ,COUNT(*) OVER (PARTITION BY A) AS TotalOf
        FROM @YourTable
)
SELECT
    n.a,n.b
    FROM NumberedRows            n
    WHERE n.RowNumber<=(n.TotalOf+1)*0.3
    ORDER BY A

OUTPUT:

a           b
----------- ----------
0           hello
1           blah1
1           blah2

(3 row(s) affected)

EDIT here are "random" rows, using Andriy M idea:

DECLARE @YourTable table (A int, b varchar(10))
INSERT @YourTable VALUES (0, 'hello') --OP's data
INSERT @YourTable VALUES (0, 'test')
INSERT @YourTable VALUES (0, 'hi')
INSERT @YourTable VALUES (1, 'blah1')
INSERT @YourTable VALUES (1, 'blah2')
INSERT @YourTable VALUES (1, 'blah3')
INSERT @YourTable VALUES (1, 'blah4')
INSERT @YourTable VALUES (1, 'blah5')
INSERT @YourTable VALUES (1, 'blah6')

;WITH NumberedRows AS
(   SELECT 
        A,B,ROW_NUMBER() OVER (PARTITION BY A ORDER BY newid()) AS RowNumber
        FROM @YourTable
)
, GroupCounts AS (SELECT A,COUNT(A) AS MaxA FROM NumberedRows GROUP BY A)
SELECT
    n.A,n.B
    FROM NumberedRows           n
        INNER JOIN GroupCounts  c ON n.A=c.A
    WHERE n.RowNUmber<=(c.MaxA+1)*0.3
    ORDER BY n.A

OUTPUT:

a           b
----------- ----------
0           hi
1           blah3
1           blah6

(3 row(s) affected)

Select 30% of each column value

Answers (2)

Related Questions