James Theobald
James Theobald

Reputation: 121

DT transformation in data.table

I have a data table similar to the one shown below that I would like to modify using data.table in RStudio.

library(data.table)
set.seed(57)
DT <- data.table(ITEM = c("car", "cars", "Car ",
                          "Tire", "tyre", " breaks",
                          "brake  pads ", " Steering  wheel ",
                          "steering wheel", "door window", "door/window",
                          "bike", "Bicycle", "chain", "chains "),
                 METRIC_1 = sample(15:100, size = 15, replace = TRUE),
                 METRIC_1_CLASS = c("A", "B", "C"),
                 METRIC_2 = rnorm(15, 38.50, sd = 3),
                 METRIC_2_CLASS = c("I", "II", "III"))

# DT
# 
# > DT
#                 ITEM METRIC_1 METRIC_1_CLASS METRIC_2 METRIC_2_CLASS
# 1:               car       35              A 38.87161              I
# 2:              cars       59              B 34.12781             II
# 3:              Car        18              C 40.32863            III
# 4:              Tire       29              A 38.78030              I
# 5:              tyre       78              B 34.12089             II
# 6:            breaks       72              C 36.40179            III
# 7:      brake  pads        99              A 39.32594              I
# 8:  Steering  wheel        47              B 45.04061             II
# 9:    steering wheel       62              C 44.60871            III
# 10:       door window       42              A 44.53798              I
# 11:       door/window       96              B 40.25646             II
# 12:              bike       49              C 39.59810            III
# 13:           Bicycle       93              A 41.07434              I
# 14:             chain       56              B 36.93384             II
# 15:           chains        31              C 40.13403            III

For the 'ITEM' column I would like to
1 -- change the case from lower to upper case
2 -- remove leading or trailing spaces, and replace center spaces with an underscore
3 -- combine data for car, cars, and Car; combine Tire and tyre
4 -- handle data from the first ten rows together because they are part of one larger group, and do the same for the last 4 columns. For combinations, METRIC_1 is additive and METRIC_2 is additive for the corresponding grouping.
5 -- replace in METRIC_1_CLASS with A = 100, B = 1000, C = 10000, METRIC_2_CLASS will be similarly substituted.

I would also like to setkeys to ITEM and METRIC_1_CLASS, but I'm not sure if I should do that before or after transforming DT.

This part of the code appears to implement some of the changes

DT[, .(ITEM=toupper(trimws(ITEM)))]

# > DT[, .(ITEM=toupper(trimws(ITEM)))]
#               ITEM
# 1:             CAR
# 2:            CARS
# 3:             CAR
# 4:            TIRE
# 5:            TYRE
# 6:          BREAKS
# 7:     BRAKE  PADS
# 8: STEERING  WHEEL
# 9:  STEERING WHEEL
# 10:     DOOR WINDOW
# 11:     DOOR/WINDOW
# 12:            BIKE
# 13:         BICYCLE
# 14:           CHAIN
# 15:          CHAINS

The final DT should look something like the following:

set.seed(57)
metric.2 <- DT$METRIC_2
DT <- data.table(ITEM = c("CAR", "TIRE", "BREAKS", "STEERING_WHEEL", "DOOR_WINDOW", "BIKE", "CHAIN"),
                 METRIC_1 = c(35+59+18, 29+78, 72+99, 47+62, 42+96, 49+93, 56+31),
                 METRIC_1_CLASS = c(11100, 1100, 10100, 11000, 1100, 10100, 11000),
                 METRIC_2 = c(sum(metric.2[1:3]), sum(metric.2[4:5]), sum(metric.2[6:7]), sum(metric.2[8:9]), sum(metric.2[10:11]),
                              sum(metric.2[12:13]), sum(metric.2[14:15])))
DT

# > DT
#              ITEM METRIC_1 METRIC_1_CLASS  METRIC_2
# 1:            CAR      112          11100 113.32805
# 2:           TIRE      107          11100  72.90119
# 3:         BREAKS      171          11100  75.72772
# 4: STEERING_WHEEL      109          11100  89.64932
# 5:    DOOR_WINDOW      138          11100  84.79444
# 6:           BIKE      142          11100  80.67244
# 7:          CHAIN       87          11100  77.06786

If I start with this DT instead:

set.seed(57)
DT <- data.table(ITEM = c('CAR', 'CAR', 'CAR',
                        'TIRE', 'TIRE', 'BREAKS',
                        'BREAKS', 'STEERING_WHEEL', 'STEERING_WHEEL', 'DOOR_WINDOW',
                        'DOOR_WINDOW', 'BIKE', 'BIKE', 'CHAIN', 'CHAIN'),
                 METRIC_1 = sample(15:100, size = 15, replace = TRUE),
                 METRIC_1_CLASS = c(100, 1000, 10000),
                 METRIC_2 = rnorm(15, 38.50, sd = 3),
                 METRIC_2_CLASS = c("I", "II", "III")
               )
DT

# > DT
#              ITEM METRIC_1 METRIC_1_CLASS METRIC_2 METRIC_2_CLASS
# 1:            CAR       35            100 38.87161              I
# 2:            CAR       59           1000 34.12781             II
# 3:            CAR       18          10000 40.32863            III
# 4:           TIRE       29            100 38.78030              I
# 5:           TIRE       78           1000 34.12089             II
# 6:         BREAKS       72          10000 36.40179            III
# 7:         BREAKS       99            100 39.32594              I
# 8: STEERING_WHEEL       47           1000 45.04061             II
# 9: STEERING_WHEEL       62          10000 44.60871            III
# 10:    DOOR_WINDOW       42            100 44.53798              I    
# 11:    DOOR_WINDOW       96           1000 40.25646             II
# 12:           BIKE       49          10000 39.59810            III
# 13:           BIKE       93            100 41.07434              I
# 14:          CHAIN       56           1000 36.93384             II
# 15:          CHAIN       31          10000 40.13403            III

I could do this:

DT[, lapply(.SD, sum), by = "ITEM", .SDcols = c("METRIC_1", "METRIC_1_CLASS", "METRIC_2")]

# > DT[, lapply(.SD, sum), by = "ITEM", .SDcols = c("METRIC_1", "METRIC_1_CLASS", "METRIC_2")]
#              ITEM METRIC_1 METRIC_1_CLASS  METRIC_2
# 1:            CAR      112          11100 113.32805
# 2:           TIRE      107           1100  72.90119
# 3:         BREAKS      171          10100  75.72772
# 4: STEERING_WHEEL      109          11000  89.64932
# 5:    DOOR_WINDOW      138           1100  84.79444
# 6:           BIKE      142          10100  80.67244
# 7:          CHAIN       87          11000  77.06786

Upvotes: 0

Views: 103

Answers (1)

Alexander Radev
Alexander Radev

Reputation: 662

Heavily manual, but it does the job:

# Remove leading spaces
DT[, ITEM := gsub("^\\s*([^\\s].*)$", "\\1", ITEM, perl = TRUE)]
# Remove trailing spaces
DT[, ITEM := gsub("^(.*[^\\s])\\s*$", "\\1", ITEM, perl = TRUE)]
# Substitute center spaces and slash with underscore
DT[, ITEM := gsub("\\s+|/", "_", ITEM, perl = TRUE)]
# UPPER Case
DT[, ITEM := toupper(ITEM)]

# Text cleanup
DT["CARS" == ITEM, ITEM := "CAR"]
DT["TYRE" == ITEM, ITEM := "TIRE"]
DT["BRAKE_PADS" == ITEM, ITEM := "BREAKS"]
DT["CHAINS" == ITEM, ITEM := "CHAIN"]
DT["BICYCLE" == ITEM, ITEM := "BIKE"]

# Numeric substitutions
DT[,
   METRIC_1_CLASS := 
     ifelse(
       "A" == METRIC_1_CLASS
       , 100
       , ifelse(
         "B" == METRIC_1_CLASS
         , 1000
         , ifelse(
           "C" == METRIC_1_CLASS
           , 10000
           , NA
         )))]

Upvotes: 1

Related Questions