user2943263
user2943263

Reputation: 53

Use Probability for a field in a database is in the set (6, 12, 24)

I want to find all the fields in a database where 80% or more of the values are in the set (6,12,24).

Can I use sys.tables to do this with sys.columns to build a join to then look inside of each field for the probabilty?

Example for were this might apply is in the following list:

6 - Half a Dozen
12 - A Dozen
24 - Two Dozen

So the table would look something like this in procedural language:

select all the tables names in sys.tables and look in each number column in sys.columns for that table where 80% of the values are in the set (6,12,24).

Upvotes: 0

Views: 92

Answers (1)

billinkc
billinkc

Reputation: 61239

Cribbing from my approach on this answer

https://stackoverflow.com/questions/20156747/number-of-null-values-for-every-column-on-ssis/20156981#20156981

You're going to need to look at a similar approach. Here I am using the system metadata to discover what columns are whole numbers (if you need to account for floating point numbers, factor that into the first filter).

SET NOCOUNT ON;

DECLARE
    -- actual query 
    @query nvarchar(max)
    -- templated query
,   @template nvarchar(max);


-- Return column & schema/table combo anywhere
-- there is data in the column that starts like the
-- search key.
SELECT
    @template = N'
    SELECT COUNT(1) AS rc
    , ''<COLUMN_NAME/>'' AS cname
    , ''<SCHEMA/>.<TABLE_NAME/>'' AS tname 
    FROM 
        <SCHEMA/>.<TABLE_NAME/> T 
    WHERE 
        T.<COLUMN_NAME/> IN (6, 12, 24)';


DECLARE 
    CSR CURSOR
FOR
-- Iterate through all the columns
SELECT
    ISC.TABLE_SCHEMA
,   ISC.TABLE_NAME
,   ISC.COLUMN_NAME
,   ISC.IS_NULLABLE
FROM
    INFORMATION_SCHEMA.COLUMNS ISC
WHERE
    (
        ISC.DATA_TYPE IN ('bigint', 'int', 'smallint', 'tinyint')
    );


-- Cursor variables for capturing candidate schemas, tables and columns
DECLARE
    @table_schema sysname
,   @table_name sysname
,   @column_name sysname
,   @nullable varchar(3);

DECLARE
    @RESULTS TABLE
(
    instance_count bigint NOT NULL
,   column_name sysname NOT NULL
,   table_schema nvarchar(500) NOT NULL
);

OPEN
    CSR;

FETCH NEXT
FROM
    CSR
INTO
    @table_schema
,   @table_name
,   @column_name
,   @nullable;

WHILE (@@FETCH_STATUS = 0)
BEGIN
    -- stub in actual names, make 'em safe via quotename function
    SET @query = REPLACE(@template, '<SCHEMA/>', quotename(@table_schema));
    SET @query = REPLACE(@query, '<TABLE_NAME/>', quotename(@table_name));
    SET @query = REPLACE(@query, '<COLUMN_NAME/>', quotename(@column_name));

    BEGIN TRY
        -- Dump results into a table variable
        INSERT INTO
            @RESULTS
        EXECUTE(@query);

    END TRY
    BEGIN CATCH
        -- print failing query
        PRINT @query;

    END CATCH
    FETCH NEXT
    FROM
        CSR
    INTO
        @table_schema
    ,   @table_name
    ,   @column_name
    ,   @nullable;
END
CLOSE CSR;
DEALLOCATE CSR;


WITH ROW_COUNTS AS
(
    SELECT
        s.[Name] as table_schema
    ,   t.[name] as table_name
    ,   SUM(p.rows) as TotalRowCount
    FROM
        sys.schemas s
        LEFT OUTER JOIN 
            sys.tables t
            ON s.schema_id = t.schema_id
        LEFT OUTER JOIN 
            sys.partitions p
            ON t.object_id = p.object_id
        LEFT OUTER JOIN  
            sys.allocation_units a
            ON p.partition_id = a.container_id
    WHERE
        p.index_id  in(0,1) -- 0 heap table , 1 table with clustered index
        AND p.rows is not null
        AND a.type = 1  -- row-data only , not LOB
    GROUP BY 
        s.[Name]
    ,   t.[name]
)
SELECT
    RC.table_schema
,   RC.table_name
,   R.column_name
,   R.instance_count
,   RC.TotalRowCount
    -- ensure we don't divide by zero and perform floating division
,   CAST(R.instance_count / (NULLIF(RC.TotalRowCount, 0) * 1.0) AS decimal(18,2)) AS InstancePercentage
FROM
    ROW_COUNTS AS RC
    INNER JOIN
        @results R
        ON R.table_schema = quotename(RC.table_schema) + '.' + quotename(table_name)
WHERE
    CAST(R.instance_count / (NULLIF(RC.TotalRowCount, 0) * 1.0) AS decimal(18,2)) > .8;

If you drop the filter, then you'll see the percent of values found. It currently NULLs out the percentage for tables with zero rows.

Upvotes: 2

Related Questions