Reputation: 121
I have a data table similar to the one shown below that I would like to modify using data.table
in RStudio
.
library(data.table)
set.seed(57)
DT <- data.table(ITEM = c("car", "cars", "Car ",
"Tire", "tyre", " breaks",
"brake pads ", " Steering wheel ",
"steering wheel", "door window", "door/window",
"bike", "Bicycle", "chain", "chains "),
METRIC_1 = sample(15:100, size = 15, replace = TRUE),
METRIC_1_CLASS = c("A", "B", "C"),
METRIC_2 = rnorm(15, 38.50, sd = 3),
METRIC_2_CLASS = c("I", "II", "III"))
# DT
#
# > DT
# ITEM METRIC_1 METRIC_1_CLASS METRIC_2 METRIC_2_CLASS
# 1: car 35 A 38.87161 I
# 2: cars 59 B 34.12781 II
# 3: Car 18 C 40.32863 III
# 4: Tire 29 A 38.78030 I
# 5: tyre 78 B 34.12089 II
# 6: breaks 72 C 36.40179 III
# 7: brake pads 99 A 39.32594 I
# 8: Steering wheel 47 B 45.04061 II
# 9: steering wheel 62 C 44.60871 III
# 10: door window 42 A 44.53798 I
# 11: door/window 96 B 40.25646 II
# 12: bike 49 C 39.59810 III
# 13: Bicycle 93 A 41.07434 I
# 14: chain 56 B 36.93384 II
# 15: chains 31 C 40.13403 III
For the 'ITEM' column I would like to
1 -- change the case from lower to upper case
2 -- remove leading or trailing spaces, and replace center spaces with an underscore
3 -- combine data for car
, cars
, and Car
; combine Tire
and tyre
4 -- handle data from the first ten rows together because they are part of one larger group, and do the same for the last 4 columns. For combinations, METRIC_1
is additive and METRIC_2
is additive for the corresponding grouping.
5 -- replace in METRIC_1_CLASS
with A = 100, B = 1000, C = 10000, METRIC_2_CLASS
will be similarly substituted.
I would also like to setkeys to ITEM
and METRIC_1_CLASS
, but I'm not sure if I should do that before or after transforming DT.
This part of the code appears to implement some of the changes
DT[, .(ITEM=toupper(trimws(ITEM)))]
# > DT[, .(ITEM=toupper(trimws(ITEM)))]
# ITEM
# 1: CAR
# 2: CARS
# 3: CAR
# 4: TIRE
# 5: TYRE
# 6: BREAKS
# 7: BRAKE PADS
# 8: STEERING WHEEL
# 9: STEERING WHEEL
# 10: DOOR WINDOW
# 11: DOOR/WINDOW
# 12: BIKE
# 13: BICYCLE
# 14: CHAIN
# 15: CHAINS
The final DT should look something like the following:
set.seed(57)
metric.2 <- DT$METRIC_2
DT <- data.table(ITEM = c("CAR", "TIRE", "BREAKS", "STEERING_WHEEL", "DOOR_WINDOW", "BIKE", "CHAIN"),
METRIC_1 = c(35+59+18, 29+78, 72+99, 47+62, 42+96, 49+93, 56+31),
METRIC_1_CLASS = c(11100, 1100, 10100, 11000, 1100, 10100, 11000),
METRIC_2 = c(sum(metric.2[1:3]), sum(metric.2[4:5]), sum(metric.2[6:7]), sum(metric.2[8:9]), sum(metric.2[10:11]),
sum(metric.2[12:13]), sum(metric.2[14:15])))
DT
# > DT
# ITEM METRIC_1 METRIC_1_CLASS METRIC_2
# 1: CAR 112 11100 113.32805
# 2: TIRE 107 11100 72.90119
# 3: BREAKS 171 11100 75.72772
# 4: STEERING_WHEEL 109 11100 89.64932
# 5: DOOR_WINDOW 138 11100 84.79444
# 6: BIKE 142 11100 80.67244
# 7: CHAIN 87 11100 77.06786
If I start with this DT instead:
set.seed(57)
DT <- data.table(ITEM = c('CAR', 'CAR', 'CAR',
'TIRE', 'TIRE', 'BREAKS',
'BREAKS', 'STEERING_WHEEL', 'STEERING_WHEEL', 'DOOR_WINDOW',
'DOOR_WINDOW', 'BIKE', 'BIKE', 'CHAIN', 'CHAIN'),
METRIC_1 = sample(15:100, size = 15, replace = TRUE),
METRIC_1_CLASS = c(100, 1000, 10000),
METRIC_2 = rnorm(15, 38.50, sd = 3),
METRIC_2_CLASS = c("I", "II", "III")
)
DT
# > DT
# ITEM METRIC_1 METRIC_1_CLASS METRIC_2 METRIC_2_CLASS
# 1: CAR 35 100 38.87161 I
# 2: CAR 59 1000 34.12781 II
# 3: CAR 18 10000 40.32863 III
# 4: TIRE 29 100 38.78030 I
# 5: TIRE 78 1000 34.12089 II
# 6: BREAKS 72 10000 36.40179 III
# 7: BREAKS 99 100 39.32594 I
# 8: STEERING_WHEEL 47 1000 45.04061 II
# 9: STEERING_WHEEL 62 10000 44.60871 III
# 10: DOOR_WINDOW 42 100 44.53798 I
# 11: DOOR_WINDOW 96 1000 40.25646 II
# 12: BIKE 49 10000 39.59810 III
# 13: BIKE 93 100 41.07434 I
# 14: CHAIN 56 1000 36.93384 II
# 15: CHAIN 31 10000 40.13403 III
I could do this:
DT[, lapply(.SD, sum), by = "ITEM", .SDcols = c("METRIC_1", "METRIC_1_CLASS", "METRIC_2")]
# > DT[, lapply(.SD, sum), by = "ITEM", .SDcols = c("METRIC_1", "METRIC_1_CLASS", "METRIC_2")]
# ITEM METRIC_1 METRIC_1_CLASS METRIC_2
# 1: CAR 112 11100 113.32805
# 2: TIRE 107 1100 72.90119
# 3: BREAKS 171 10100 75.72772
# 4: STEERING_WHEEL 109 11000 89.64932
# 5: DOOR_WINDOW 138 1100 84.79444
# 6: BIKE 142 10100 80.67244
# 7: CHAIN 87 11000 77.06786
Upvotes: 0
Views: 103
Reputation: 662
Heavily manual, but it does the job:
# Remove leading spaces
DT[, ITEM := gsub("^\\s*([^\\s].*)$", "\\1", ITEM, perl = TRUE)]
# Remove trailing spaces
DT[, ITEM := gsub("^(.*[^\\s])\\s*$", "\\1", ITEM, perl = TRUE)]
# Substitute center spaces and slash with underscore
DT[, ITEM := gsub("\\s+|/", "_", ITEM, perl = TRUE)]
# UPPER Case
DT[, ITEM := toupper(ITEM)]
# Text cleanup
DT["CARS" == ITEM, ITEM := "CAR"]
DT["TYRE" == ITEM, ITEM := "TIRE"]
DT["BRAKE_PADS" == ITEM, ITEM := "BREAKS"]
DT["CHAINS" == ITEM, ITEM := "CHAIN"]
DT["BICYCLE" == ITEM, ITEM := "BIKE"]
# Numeric substitutions
DT[,
METRIC_1_CLASS :=
ifelse(
"A" == METRIC_1_CLASS
, 100
, ifelse(
"B" == METRIC_1_CLASS
, 1000
, ifelse(
"C" == METRIC_1_CLASS
, 10000
, NA
)))]
Upvotes: 1