Reputation: 1038
If I have a file with many column, the data are all numbers, how can I know whether a specific column is categorical or quantitative data?. Is there an area of study for this kind of problem? If not, what are some heuristics that can be used to determine?
Some heuristics that I can think of:
some_threshold
, there is higher chance to be categorical data.I am using R, but the question doesn't need to be R specific.
Upvotes: 7
Views: 8661
Reputation: 213
Here is a first cut at an R function using most of those suggestions:
require( "hablar" )
require( "DescTools" )
# unique.p - threshold for unique values as a proportion of total cases
# unique.n - if unique values of x < unique.n then classify as factor
# first.n - if 90% of cases are contained within the first.n levels then classify as a factor
# max.v - if x is an integer and the variance of x is below max.v then classify as a factor
# b.to.f - convert binary variables (x in 0,1) to factors?
is_factor <- function( x, unique.p=0.10, unique.n=(length(x)*unique.p),
first.n=25, max.v=2, b.to.f=FALSE )
{
cat( paste0( "\n----------------- ", deparse(substitute(x)), "\n\n" ) )
# exclude NA, NaN, and Inf
if( is.numeric(x) | is.logical(x) )
{ x <- x[ is.finite(x) ] }
if( is.character(x) )
{
x[ x == "NaN" | x == "Inf" ] <- NA
x <- na.omit(x)
}
n <- length(x)
if( n == 0 )
{
cat( "The variable is empty (all NAs)" )
return(FALSE)
}
cat( paste0( "Valid N (after NA drop) = ", n, "\n" ) )
cat( paste0( "Unique levels/values of x = ", length(unique(x)), "\n" ) )
cat( paste0( "unique.n argument = ", unique.n, "\n" ) )
cat( paste0( "unique.p argument = ", unique.p, "\n\n" ) )
if( "factor" %in% class(x) )
{
cat( "has class FACTOR \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
if( "logical" %in% class(x) )
{
cat( "has class LOGICAL: is NOT a factor \n\n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
return(FALSE)
}
if( any( DescTools::IsDate(x) ) )
{
x.dates <- x[ DescTools::IsDate(x) ]
cat( "x has class DATE: is NOT a factor \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x.dates), 10 ), collapse=",\n" ) ) )
return(FALSE)
}
if( "character" %in% class(x) )
{
cat( "x has class CHARACTER: \n\n" )
# is a logical vector
if( length(unique(x)) == 1 )
{
cat( "All values of x are the same: \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n" ) )
if( b.to.f )
{
cat( "Convert binary to factor is set to TRUE \n\n" )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
cat( "Convert binary to factor is set to FALSE \n\n" )
return(FALSE)
}
# strings with same length (standardized categories)
# but keep the total levels low so it doesn't flag IDs
is.same <- length( unique( nchar(x) ) ) == 1 & length(unique(x)) < ( n * unique.p )
if( is.same )
{
cat( "All strings have the same number of characters \n\n" )
cat( paste0( "Values of x (first 10): \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
}
# small number of unique cases
n.unique <- length( unique( x ) )
# small prop of total cases unique
p.unique <- length( unique( x ) ) / n
is.small.unique.n <- n.unique <= unique.n & p.unique <= unique.p
if( is.small.unique.n )
{
cat( "x has a small number & proportion of unique cases\n" )
cat( paste0( "N < ", unique.n, " & prop < ", unique.p, "\n" ) )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
}
# most common levels account for large portion of total
first.n.total <- table(x) %>% sort(desc=T) %>% head( first.n ) %>% sum()
total.p <- first.n.total / n
is.large.p.total <- total.p > 0.90
first.n.levels <- table(x) %>% sort(desc=T) %>% head( first.n ) %>% names()
if( is.large.p.total )
{
cat( paste0( "First ", first.n, " levels accounts for > 90% of total cases \n" ) )
cat( paste0( "First N levels: \n", paste( first.n.levels, collapse=",\n" ), "\n\n" ) )
}
# if it meets any criteria return factor
if( is.same | is.small.unique.n | is.large.p.total )
{
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
}
# only test integers
x <- hablar::retype(x)
if( "numeric" %in% class(x) )
{
cat( "x is non-integer number: NOT a factor \n\n" )
cat( paste0( "Values of x (first 10): \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
return(FALSE)
}
if( "integer" %in% class(x) )
{
cat( "x has class INTEGER: \n\n" )
# is a logical vector
if( all( x %in% c(0,1) ) | length(unique(x))==1 )
{
cat( "All values of x are 0/1 or a single value: \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n" ) )
if( b.to.f )
{
cat( "Convert binary to factor is set to TRUE \n\n" )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
cat( "Convert binary to factor is set to FALSE \n\n" )
return(FALSE)
}
# has negative values
if( any( x < 0 ) )
{
cat( "Contains negative integers \n" )
cat( paste0( "Range x: ", range(x), "\n\n" ) )
return(FALSE)
}
# small numer of unique values
n.unique <- length( unique( x ) )
# small prop of total cases unique
p.unique <- length( unique( x ) ) / n
is.small.unique.n <- n.unique <= unique.n & p.unique <= unique.p
if( is.small.unique.n )
{
cat( "x has a small number & proportion of unique cases \n" )
cat( paste0( "unique(x) < ", unique.n, " & unique(x)/length(x) < ", unique.p, " \n" ) )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
}
# starts with 1 and is an approximate sequence
starts.with.one <- min(x) == 1
width.of.range.x <- max(x) - min(x) + 1
is.approx.seq <- length(unique(x)) / width.of.range.x > 0.8
is.seq.from.one <- starts.with.one & is.approx.seq
if( is.seq.from.one )
{ cat( "x is an approximate sequence of integers starting with one \n\n" ) }
# is a true sequence, e.g. 9,10,11,12
is.true.seq <- length(unique(x)) == width.of.range.x &
length(unique(x))/length(x) < unique.p
if( is.true.seq )
{
cat( "x is a true sequence of integers \n" )
cat( paste0( "Values: \n", paste( sort(unique(x)), collapse=",\n" ), "\n\n" ) )
}
# equal intervals between all numbers
is.equal.intervals <- length( unique( x[-1] - x[-length(x)] ) ) == 1
if( is.equal.intervals )
{
cat( "All values of x have equal intervals between them \n" )
cat( paste0( "Values: ", paste( head(sort(unique(x))), collapse="," ), "\n\n" ) )
}
# small variance
is.small.var <- var(x) < max.v
if( is.small.var )
{ cat( paste0( "The variance of x is below ", max.v, "\n\n" ) ) }
# if it meets any criteria return factor
if( is.small.unique.n | is.seq.from.one | is.true.seq | is.equal.intervals )
{
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
}
cat( "There are a large number of unique values: x is NOT a factor \n" )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
return( FALSE )
}
Sample dataset: mtcars:
Potential factors could be:
Cyl and gear get flagged as factors. Carb has 6 unique values or 6/32 = 18% unique proportion score, above the 10% threshold set by unique.p.
Those arguments will be sensitive to the sample size - for example, 50 unique state codes represent a large proportion of total values in a dataset with a couple hundred addresses but the number of states won't grow with size so the proportion of unique values over total cases naturally gets smaller as the dataset grows. These demo datasets are sensitive.
The argument "b.to.f" can be set to TRUE if you want binary variables to be flagged as factors: vs and am in this example.
# mpg Miles/(US) gallon
# cyl Number of cylinders
# disp Displacement (cu.in.)
# hp Gross horsepower
# drat Rear axle ratio
# wt Weight (1000 lbs)
# qsec 1/4 mile time
# vs Engine (0 = V-shaped, 1 = straight)
# am Transmission (0 = automatic, 1 = manual)
# gear Number of forward gears
# carb Number of carburetors
> head( mtcars )
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
lapply( mtcars, is_factor )
----------------- mpg
Valid N (after NA drop) = 32
Unique levels/values of x = 25
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
21,
22.8,
21.4,
18.7,
18.1,
14.3,
24.4,
19.2,
17.8,
16.4
----------------- cyl
Valid N (after NA drop) = 32
Unique levels/values of x = 3
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
x has a small number & proportion of unique cases
unique(x) < 3.2 & unique(x)/length(x) < 0.1
Number of unique values of x: 3
Values of x (first 10):
4,
6,
8
#### IS FACTOR ####
----------------- disp
Valid N (after NA drop) = 32
Unique levels/values of x = 27
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
160,
108,
258,
360,
225,
146.7,
140.8,
167.6,
275.8,
472
----------------- hp
Valid N (after NA drop) = 32
Unique levels/values of x = 22
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
There are a large number of unique values: x is NOT a factor
Number of unique values of x: 22
Values of x (first 10):
52,
62,
65,
66,
91,
93,
95,
97,
105,
109
----------------- drat
Valid N (after NA drop) = 32
Unique levels/values of x = 22
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
3.9,
3.85,
3.08,
3.15,
2.76,
3.21,
3.69,
3.92,
3.07,
2.93
----------------- wt
Valid N (after NA drop) = 32
Unique levels/values of x = 29
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
2.62,
2.875,
2.32,
3.215,
3.44,
3.46,
3.57,
3.19,
3.15,
4.07
----------------- qsec
Valid N (after NA drop) = 32
Unique levels/values of x = 30
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
16.46,
17.02,
18.61,
19.44,
20.22,
15.84,
20,
22.9,
18.3,
18.9
----------------- vs
Valid N (after NA drop) = 32
Unique levels/values of x = 2
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
All values of x are 0/1 or a single value:
Values of x:
0,
1
Convert binary to factor is set to FALSE
----------------- am
Valid N (after NA drop) = 32
Unique levels/values of x = 2
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
All values of x are 0/1 or a single value:
Values of x:
1,
0
Convert binary to factor is set to FALSE
----------------- gear
Valid N (after NA drop) = 32
Unique levels/values of x = 3
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
x has a small number & proportion of unique cases
unique(x) < 3.2 & unique(x)/length(x) < 0.1
Number of unique values of x: 3
Values of x (first 10):
3,
4,
5
x is a true sequence of integers
Values:
3,
4,
5
The variance of x is below 2
#### IS FACTOR ####
----------------- carb
Valid N (after NA drop) = 32
Unique levels/values of x = 6
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
There are a large number of unique values: x is NOT a factor
Number of unique values of x: 6
Values of x (first 10):
1,
2,
3,
4,
6,
8
$mpg
[1] FALSE
$cyl
[1] TRUE
$disp
[1] FALSE
$hp
[1] FALSE
$drat
[1] FALSE
$wt
[1] FALSE
$qsec
[1] FALSE
$vs
[1] FALSE
$am
[1] FALSE
$gear
[1] TRUE
$carb
[1] FALSE
Upvotes: 0
Reputation: 13123
This assumes someone coded the data correctly.
Perhaps you are suggesting the data were not coded or labeled correctly, that it was all entered as numeric and some of it really is categorical. In that case, I do not know how one could tell with any certainty. Categorical data can have decimals places and can be negative.
The question I would ask myself in such a situation is what difference does it make how I treat the data?
If you are interested in the second scenario perhaps you should ask your question on Stack Exchange.
my.data <- read.table(text = '
aa bb cc dd
10 100 1000 1
20 200 2000 2
30 300 3000 3
40 400 4000 4
50 500 5000 5
60 600 6000 6
', header = TRUE, colClasses = c('numeric', 'character', 'numeric', 'character'))
my.data
# one way
str(my.data)
'data.frame': 6 obs. of 4 variables:
$ aa: num 10 20 30 40 50 60
$ bb: chr "100" "200" "300" "400" ...
$ cc: num 1000 2000 3000 4000 5000 6000
$ dd: chr "1" "2" "3" "4" ...
Here is a way to record the information:
my.class <- rep('empty', ncol(my.data))
for(i in 1:ncol(my.data)) {
my.class[i] <- class(my.data[,i])
}
> my.class
[1] "numeric" "character" "numeric" "character"
EDIT
Here is a way to record class
for each column without using a for-loop
:
my.class <- sapply(my.data, class)
Upvotes: 2