Reputation: 509

Clean a dataset that includes some character in each row

I have a dataset like this:

structure(list(`Frequency
Percent` = c("car", "window", "ball", 
"ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00", 
"218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01", 
"4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01", 
"0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA, 
-4L))

How can I split each row by "" and remove n to make two new columns. For instance, car and AI cell (“2\n0.00”), I will have 2 and 0.00 in two different columns.

Upvotes: 1

Answers (5)

GKi

Reputation: 39647

A base one liner:

do.call(data.frame, lapply(DF, \(x) do.call(rbind, strsplit(x, "\n"))))
#  Frequency.Percent AI.1 AI.2 BLK.1 BLK.2 HIANIC.1 HIANIC.2 NATRICAN.1
#1               car    2 0.00     0  0.00        1     0.00          9
#2            window    3 0.00   218  0.29        8     0.01          7
#3              ball    1 0.00    48  0.06        4     0.01          8
#4               ups    2 0.00     0  0.00        0     0.00          0
#  NATRICAN.2 UNK.1 UNK.2 yy.1 yy.2
#1       0.01    15  0.02  111 0.15
#2       0.01    83  0.11  897 1.20
#3       0.01    36  0.05  756 1.02
#4       0.00     0  0.00    1 0.00

Or add also a type convert.

type.convert(do.call(data.frame, lapply(DF, \(x) do.call(rbind, strsplit(x, "\n")))), as.is=TRUE)

Upvotes: 1

akrun

Reputation: 886938

We may use cSplit

library(splitstackshape)
cSplit(df1, 2:ncol(df1), sep = "\n")

-output

   Frequency\nPercent AI_1 AI_2 BLK_1 BLK_2 HIANIC_1 HIANIC_2 NATRICAN_1 NATRICAN_2 UNK_1 UNK_2 yy_1 yy_2
1:                car    2    0     0  0.00        1     0.00          9       0.01    15  0.02  111 0.15
2:             window    3    0   218  0.29        8     0.01          7       0.01    83  0.11  897 1.20
3:               ball    1    0    48  0.06        4     0.01          8       0.01    36  0.05  756 1.02
4:                ups    2    0     0  0.00        0     0.00          0       0.00     0  0.00    1 0.00

Upvotes: 0

user1505631

Reputation: 519

There is also a base R solution:

dat = structure(list(`Frequency
                       Percent` = c("car", "window", "ball", 
                                    "ups"), AI = c("2\n0.00", "3\n0.00", "1\n0.00", "2\n0.00"), BLK = c("0\n0.00", 
                                                                                                        "218\n0.29", "48\n0.06", "0\n0.00"), HIANIC = c("1\n0.00", "8\n0.01", 
                                                                                                                                                        "4\n0.01", "0\n0.00"), NATRICAN = c("9\n0.01", "7\n0.01", "8\n0.01", 
                                                                                                                                                                                            "0\n0.00"), UNK = c("15\n0.02", "83\n0.11", "36\n0.05", "0\n0.00"
                                                                                                                                                                                            ), yy = c("111\n0.15", "897\n1.20", "756\n1.02", "1\n0.00")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                              -4L))
    transformed = data.frame(Freq_pc = dat[,1])
for(col in seq(2, ncol(dat))){
    transformed = cbind(transformed, t(matrix(unlist(strsplit(dat[,col], "\n")), nrow=2)))
    names(transformed)[c(2*(col-1), 2*(col-1)+1)] = c(paste0(names(dat)[col], "_n"), paste0(names(dat)[col], "_pc"))
}

That results in:

 Freq_pc AI_n AI_pc BLK_n BLK_pc HIANIC_n HIANIC_pc NATRICAN_n NATRICAN_pc UNK_n UNK_pc yy_n yy_pc
1     car    2  0.00     0   0.00        1      0.00          9        0.01    15   0.02  111  0.15
2  window    3  0.00   218   0.29        8      0.01          7        0.01    83   0.11  897  1.20
3    ball    1  0.00    48   0.06        4      0.01          8        0.01    36   0.05  756  1.02
4     ups    2  0.00     0   0.00        0      0.00          0        0.00     0   0.00    1  0.00

Upvotes: 0

jpsmith

Reputation: 17174

One way is to use tidyr::separate in a for loop:

for(i in names(df[,-1])){
  df <- tidyr::separate(df, i, sep = "\n", into = c(i, paste0(i,"_val")))
}

Output:

# Frequency\n Percent AI  AI_val BLK  BLK_val   HIANIC  HIANIC_val  NATRICAN  NATRICAN_val UNK UNK_val  yy yy_val
# 1           car     2   0.00   0    0.00      1       0.00        9         0.01         15    0.02   111   0.15
# 2           window  3   0.00   218  0.29      8       0.01        7         0.01         83    0.11   897   1.20
# 3           ball    1   0.00   48   0.06      4       0.01        8         0.01         36    0.05   756   1.02
# 4           ups     2   0.00   0    0.00      0       0.00        0         0.00         0     0.00   1     0.00

Upvotes: 2

stefan

Reputation: 123783

Using tidyr::separate_rows and tidyr::pivot_wider you could do:

library(tidyr)
library(dplyr)

dat |> 
  mutate(unit = c("n\npct")) |> 
  separate_rows(-1, sep = "\n") |> 
  pivot_wider(names_from = "unit", values_from = -1)
#> # A tibble: 4 × 15
#>   Frequency\n…¹ AI_n  AI_pct BLK_n BLK_pct HIANI…² HIANI…³ NATRI…⁴ NATRI…⁵ UNK_n
#>   <chr>         <chr> <chr>  <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>
#> 1 car           2     0.00   0     0.00    1       0.00    9       0.01    15   
#> 2 window        3     0.00   218   0.29    8       0.01    7       0.01    83   
#> 3 ball          1     0.00   48    0.06    4       0.01    8       0.01    36   
#> 4 ups           2     0.00   0     0.00    0       0.00    0       0.00    0    
#> # … with 5 more variables: UNK_pct <chr>, yy_n <chr>, yy_pct <chr>,
#> #   unit_n <chr>, unit_pct <chr>, and abbreviated variable names
#> #   ¹`Frequency\n                Percent`, ²HIANIC_n, ³HIANIC_pct, ⁴NATRICAN_n,
#> #   ⁵NATRICAN_pct

Upvotes: 1

Clean a dataset that includes some character in each row

Answers (5)

Related Questions