33 Reshape

library(data.table)
options(datatable.print.class = TRUE)

33.1 Long to wide using `dcast()`

33.1.1 Example 1: key-value pairs

Using the same example seen in Chapter 22:

dat_long <- data.frame(
    Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004, 
            8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004), 
    Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671, 67.8017038366664, 42.9198507293701, 
            46.2301756642422, 39.665983196671, 67.8017038366664, 
            42.9198507293701, 46.2301756642422, 39.665983196671, 
            67.8017038366664, 42.9198507293701, 46.2301756642422, 
            39.665983196671), 
    Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned", 
            "Planned", "ED"), 
    Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC", 
            "Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit", 
            "Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"), 
    Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765, 
            4.93897736897793, 8374.22887757195, 7612.37380499927, 
            8759.27855519425, 6972.28096216548, 36.272693147236, 
            40.5716317809522, 39.9888624177955, 39.8786884058422, 
            12.6188444991545, 12.1739747363806, 15.1293426442183, 
            14.8885696185238)
)
dat_long

   Account_ID      Age Admission    Lab_key   Lab_value
1        8001 67.80170        ED        RBC    4.634493
2        8002 42.91985   Planned        RBC    3.349686
3        8003 46.23018   Planned        RBC    4.270372
4        8004 39.66598        ED        RBC    4.938977
5        8001 67.80170        ED        WBC 8374.228878
6        8002 42.91985   Planned        WBC 7612.373805
7        8003 46.23018   Planned        WBC 8759.278555
8        8004 39.66598        ED        WBC 6972.280962
9        8001 67.80170        ED Hematocrit   36.272693
10       8002 42.91985   Planned Hematocrit   40.571632
11       8003 46.23018   Planned Hematocrit   39.988862
12       8004 39.66598        ED Hematocrit   39.878688
13       8001 67.80170        ED Hemoglobin   12.618844
14       8002 42.91985   Planned Hemoglobin   12.173975
15       8003 46.23018   Planned Hemoglobin   15.129343
16       8004 39.66598        ED Hemoglobin   14.888570

data.table’s long to wide procedure is defined with a convenient formula notation:

dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
                          Account_ID + Age + Admission ~ Lab_key,
                          value.var = "Lab_value")
dat_long2wide_dt

Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

Instead of listing all variables you can use ..., which corresponds to all variables not otherwise mentioned in the formula or in value.var:

dcast(dat_long_dt,
      ... ~ Lab_key,
      value.var = "Lab_value")

Key: <Account_ID, Age, Admission>
   Account_ID      Age Admission Hematocrit Hemoglobin      RBC      WBC
        <num>    <num>    <char>      <num>      <num>    <num>    <num>
1:       8001 67.80170        ED   36.27269   12.61884 4.634493 8374.229
2:       8002 42.91985   Planned   40.57163   12.17397 3.349686 7612.374
3:       8003 46.23018   Planned   39.98886   15.12934 4.270372 8759.279
4:       8004 39.66598        ED   39.87869   14.88857 4.938977 6972.281

33.1.2 Example 2: Timepoints

Another simple synthetic dataset:

dt_long <- data.table(
        ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), 
        Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
        2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B", 
        "Timepoint_C"), class = "factor"), 
        Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long

       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54

dt_long

       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54

dcast(dt_long, ID ~ Timepoint,
      value.var = "Score")

Key: <ID>
      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54

33.1.3 `dcast()` + aggregate

If your ID ~ Timepoint combination does not define a unique row in your input dataset, you need to specify an aggregate function.

For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.

dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
                      Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
                      Var1 = rnorm(24, mean = 10),
                      Var2 = rnorm(24, mean = 20))

dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2

        ID Timepoint      Var1     Var2
    <char>    <char>     <num>    <num>
 1:      A        AM  9.426754       NA
 2:      A        AM        NA 21.05159
 3:      A        AM 10.400182 20.42153
 4:      A        PM  9.118664       NA
 5:      A        PM  9.250790       NA
 6:      A        PM 10.006361 19.72336
 7:      B        AM        NA 20.47440
 8:      B        AM 10.385733       NA
 9:      B        AM  9.984771 20.43867
10:      B        PM 10.958410 20.23529
11:      B        PM  9.859273 20.80292
12:      B        PM  8.801945 21.38419
13:      C        AM 11.025027 21.17170
14:      C        AM  9.000606 19.95727
15:      C        AM  8.536722 21.65135
16:      C        PM 10.343971 19.52186
17:      C        PM        NA 21.00053
18:      C        PM  8.843196 20.22614
19:      D        AM 10.468846 21.22194
20:      D        AM 10.767552 20.67233
21:      D        AM  8.688219 20.74750
22:      D        PM        NA 20.35396
23:      D        PM  8.013317 19.34461
24:      D        PM  9.068445 19.42845
        ID Timepoint      Var1     Var2

If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate argument:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = mean, na.rm = TRUE)

Key: <ID>
       ID   Var1_AM  Var1_PM  Var2_AM  Var2_PM
   <char>     <num>    <num>    <num>    <num>
1:      A  9.913468 9.458605 20.73656 19.72336
2:      B 10.185252 9.873209 20.45653 20.80747
3:      C  9.520785 9.593584 20.92677 20.24951
4:      D  9.974872 8.540881 20.88059 19.70901

You can apply multiple aggregating functions by passing a list to fun.aggregate:

dcast(dt_long2,
      ID ~ Timepoint,
      value.var = c("Var1", "Var2"),
      fun.aggregate = list(mean, max, min), na.rm = TRUE)

Key: <ID>
       ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
   <char>        <num>        <num>        <num>        <num>       <num>
1:      A     9.913468     9.458605     20.73656     19.72336    10.40018
2:      B    10.185252     9.873209     20.45653     20.80747    10.38573
3:      C     9.520785     9.593584     20.92677     20.24951    11.02503
4:      D     9.974872     8.540881     20.88059     19.70901    10.76755
   Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
         <num>       <num>       <num>       <num>       <num>       <num>
1:   10.006361    21.05159    19.72336    9.426754    9.118664    20.42153
2:   10.958410    20.47440    21.38419    9.984771    8.801945    20.43867
3:   10.343971    21.65135    21.00053    8.536722    8.843196    19.95727
4:    9.068445    21.22194    20.35396    8.688219    8.013317    20.67233
   Var2_min_PM
         <num>
1:    19.72336
2:    20.23529
3:    19.52186
4:    19.34461

Note how na.rm = TRUE was successfully applied to all aggregating functions

33.2 Wide to long: `melt()`

33.2.1 Example 1

dt_wide <- data.table(
        ID = 1:4,
        Timepoint_A = 11:14,
        Timepoint_B = 21:24,
        Timepoint_C = 51:54)
dt_wide

      ID Timepoint_A Timepoint_B Timepoint_C
   <int>       <int>       <int>       <int>
1:     1          11          21          51
2:     2          12          22          52
3:     3          13          23          53
4:     4          14          24          54

dt_long <- melt(
        dt_wide, 
        id.vars = "ID",
        measure.vars = 2:4, # defaults to all non-id columns
        variable.name = "Timepoint",
        value.name = c("Score"))
dt_long

       ID   Timepoint Score
    <int>      <fctr> <int>
 1:     1 Timepoint_A    11
 2:     2 Timepoint_A    12
 3:     3 Timepoint_A    13
 4:     4 Timepoint_A    14
 5:     1 Timepoint_B    21
 6:     2 Timepoint_B    22
 7:     3 Timepoint_B    23
 8:     4 Timepoint_B    24
 9:     1 Timepoint_C    51
10:     2 Timepoint_C    52
11:     3 Timepoint_C    53
12:     4 Timepoint_C    54

33.2.2 Example 2

Using the same synthetic data as in Chapter 22:

dat_wide_dt <- data.table(
    Account_ID = c(8001, 8002, 8003, 8004),
    Age = rnorm(4, mean = 57, sd = 12),
    RBC = rnorm(4, mean = 4.8, sd = .5),
    WBC = rnorm(4, mean = 7250, sd = 1500),
    Hematocrit = rnorm(4, mean = 40.2, sd = 4),
    Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
    Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt

   Account_ID      Age      RBC      WBC Hematocrit Hemoglobin Admission
        <num>    <num>    <num>    <num>      <num>      <num>    <char>
1:       8001 45.36273 5.244354 8361.456   44.14094   13.95509   Planned
2:       8002 50.41712 4.878707 6750.612   38.02299   14.24148   Planned
3:       8003 53.44650 5.559841 7574.076   44.08299   10.54037   Planned
4:       8004 52.62272 5.300168 6317.071   38.10480   13.21905   Planned

dat_wide2long_dt <- melt(dat_wide_dt,
                         id.vars = c(1:2, 7),
                         measure.vars = 3:6,
                         variable.name = "Lab_key",
                         value.name = "Lab_value")
dat_wide2long_dt

    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 45.36273   Planned        RBC    5.244354
 2:       8002 50.41712   Planned        RBC    4.878707
 3:       8003 53.44650   Planned        RBC    5.559841
 4:       8004 52.62272   Planned        RBC    5.300168
 5:       8001 45.36273   Planned        WBC 8361.456332
 6:       8002 50.41712   Planned        WBC 6750.611609
 7:       8003 53.44650   Planned        WBC 7574.075643
 8:       8004 52.62272   Planned        WBC 6317.070986
 9:       8001 45.36273   Planned Hematocrit   44.140940
10:       8002 50.41712   Planned Hematocrit   38.022995
11:       8003 53.44650   Planned Hematocrit   44.082992
12:       8004 52.62272   Planned Hematocrit   38.104797
13:       8001 45.36273   Planned Hemoglobin   13.955087
14:       8002 50.41712   Planned Hemoglobin   14.241485
15:       8003 53.44650   Planned Hemoglobin   10.540369
16:       8004 52.62272   Planned Hemoglobin   13.219047

If desired, you can set the ID column as they key, which will sort the data.table by its values:

setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt

    Account_ID      Age Admission    Lab_key   Lab_value
         <num>    <num>    <char>     <fctr>       <num>
 1:       8001 45.36273   Planned        RBC    5.244354
 2:       8001 45.36273   Planned        WBC 8361.456332
 3:       8001 45.36273   Planned Hematocrit   44.140940
 4:       8001 45.36273   Planned Hemoglobin   13.955087
 5:       8002 50.41712   Planned        RBC    4.878707
 6:       8002 50.41712   Planned        WBC 6750.611609
 7:       8002 50.41712   Planned Hematocrit   38.022995
 8:       8002 50.41712   Planned Hemoglobin   14.241485
 9:       8003 53.44650   Planned        RBC    5.559841
10:       8003 53.44650   Planned        WBC 7574.075643
11:       8003 53.44650   Planned Hematocrit   44.082992
12:       8003 53.44650   Planned Hemoglobin   10.540369
13:       8004 52.62272   Planned        RBC    5.300168
14:       8004 52.62272   Planned        WBC 6317.070986
15:       8004 52.62272   Planned Hematocrit   38.104797
16:       8004 52.62272   Planned Hemoglobin   13.219047

33.3 Resources

Efficient reshaping using data.tables

33.1 Long to wide using dcast()