Feng Chen
Feng Chen

Reputation: 2253

How to use SQL to sample 50% of records in each group?

I am new to SQL. My data is like this:

c1  c2  c3  group
1   2   1   1
1   2   2   1
3   5   6   1
2   4   7   2
5   3   4   2
2   1   3   2
6   4   4   2
7   8   3   3
1   2   2   3
3   2   2   3

For each group (indicated by the column group), I want to sample 50% of the records (keeping all the columns) and save the result into a new table.

Upvotes: 0

Views: 264

Answers (1)

osiris_v6
osiris_v6

Reputation: 462

You can use NTILE. If you want each group to be order by other column, just change the ORDER BY from NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn

IF OBJECT_ID('tempdb..#SampleData') IS NOT NULL
    DROP TABLE #SampleData;
CREATE TABLE #SampleData
    (
      c1 INT ,
      c2 INT ,
      c3 INT ,
      [group] INT
    );

IF OBJECT_ID('tempdb..#OutputData') IS NOT NULL
    DROP TABLE #OutputData;
CREATE TABLE #OutputData
    (
      c1 INT ,
      c2 INT ,
      c3 INT ,
      [group] INT
    );

INSERT  INTO [#SampleData]
        ( [c1], [c2], [c3], [group] )
VALUES  ( 1, -- c1 - int
          2, -- c2 - int
          1, -- c3 - int
          1  -- groupNo - int
          ),
        ( 1, -- c1 - int
          22, -- c2 - int
          12, -- c3 - int
          1  -- groupNo - int
          ),
        ( 144, -- c1 - int
          244, -- c2 - int
          144, -- c3 - int
          1  -- groupNo - int
          ),
        ( 2, -- c1 - int
          23, -- c2 - int
          4, -- c3 - int
          1  -- groupNo - int
          ),
        ( 13, -- c1 - int
          24, -- c2 - int
          14, -- c3 - int
          1  -- groupNo - int
          ),
        ( 16, -- c1 - int
          266, -- c2 - int
          133, -- c3 - int
          2  -- groupNo - int
          ),
          ( 16, -- c1 - int
          266, -- c2 - int
          133, -- c3 - int
          2  -- groupNo - int
          ),
          ( 16, -- c1 - int
          266, -- c2 - int
          133, -- c3 - int
          2  -- groupNo - int
          ),
        ( 123, -- c1 - int
          293, -- c2 - int
          123, -- c3 - int
          2  -- groupNo - int
          ),
        ( 123, -- c1 - int
          223, -- c2 - int
          123, -- c3 - int
          3  -- groupNo - int
          )

;WITH   cte
          AS ( SELECT   [t].[c1] ,
                        [t].[c2] ,
                        [t].[c3] ,
                        [t].[group] ,
                        NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn
               FROM     [#SampleData] AS [t]
             )
    INSERT  INTO [#OutputData]
            ( [c1] ,
              [c2] ,
              [c3] ,
              [group]
            )
    SELECT  [cte].[c1] ,
            [cte].[c2] ,
            [cte].[c3] ,
            [cte].[group]
    FROM    [cte]
    WHERE   rn = 1;

SELECT  [sd].[c1] ,
        [sd].[c2] ,
        [sd].[c3] ,
        [sd].[group]
FROM    [#OutputData] AS [sd];

Upvotes: 2

Related Questions