library(data.table)
options(datatable.print.class = TRUE)
33 Reshape
33.1 Long to wide using dcast()
33.1.1 Example 1: key-value pairs
Using the same example seen in Chapter 22:
dat_long <- data.frame(
Account_ID = c(8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004,
8001, 8002, 8003, 8004, 8001, 8002, 8003, 8004),
Age = c(67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671, 67.8017038366664, 42.9198507293701,
46.2301756642422, 39.665983196671, 67.8017038366664,
42.9198507293701, 46.2301756642422, 39.665983196671,
67.8017038366664, 42.9198507293701, 46.2301756642422,
39.665983196671),
Admission = c("ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED", "ED", "Planned", "Planned", "ED", "ED", "Planned",
"Planned", "ED"),
Lab_key = c("RBC", "RBC", "RBC", "RBC", "WBC", "WBC", "WBC", "WBC",
"Hematocrit", "Hematocrit", "Hematocrit", "Hematocrit",
"Hemoglobin", "Hemoglobin", "Hemoglobin", "Hemoglobin"),
Lab_value = c(4.63449321082268, 3.34968550627897, 4.27037213597765,
4.93897736897793, 8374.22887757195, 7612.37380499927,
8759.27855519425, 6972.28096216548, 36.272693147236,
40.5716317809522, 39.9888624177955, 39.8786884058422,
12.6188444991545, 12.1739747363806, 15.1293426442183,
14.8885696185238)
)
dat_long
Account_ID Age Admission Lab_key Lab_value
1 8001 67.80170 ED RBC 4.634493
2 8002 42.91985 Planned RBC 3.349686
3 8003 46.23018 Planned RBC 4.270372
4 8004 39.66598 ED RBC 4.938977
5 8001 67.80170 ED WBC 8374.228878
6 8002 42.91985 Planned WBC 7612.373805
7 8003 46.23018 Planned WBC 8759.278555
8 8004 39.66598 ED WBC 6972.280962
9 8001 67.80170 ED Hematocrit 36.272693
10 8002 42.91985 Planned Hematocrit 40.571632
11 8003 46.23018 Planned Hematocrit 39.988862
12 8004 39.66598 ED Hematocrit 39.878688
13 8001 67.80170 ED Hemoglobin 12.618844
14 8002 42.91985 Planned Hemoglobin 12.173975
15 8003 46.23018 Planned Hemoglobin 15.129343
16 8004 39.66598 ED Hemoglobin 14.888570
data.table
’s long to wide procedure is defined with a convenient formula notation:
dat_long_dt <- as.data.table(dat_long)
dat_long2wide_dt <- dcast(dat_long_dt,
Account_ID + Age + Admission ~ Lab_key,
value.var = "Lab_value")
dat_long2wide_dt
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
Instead of listing all variables you can use ...
, which corresponds to all variables not otherwise mentioned in the formula or in value.var
:
dcast(dat_long_dt,
... ~ Lab_key,
value.var = "Lab_value")
Key: <Account_ID, Age, Admission>
Account_ID Age Admission Hematocrit Hemoglobin RBC WBC
<num> <num> <char> <num> <num> <num> <num>
1: 8001 67.80170 ED 36.27269 12.61884 4.634493 8374.229
2: 8002 42.91985 Planned 40.57163 12.17397 3.349686 7612.374
3: 8003 46.23018 Planned 39.98886 15.12934 4.270372 8759.279
4: 8004 39.66598 ED 39.87869 14.88857 4.938977 6972.281
33.1.2 Example 2: Timepoints
Another simple synthetic dataset:
dt_long <- data.table(
ID = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L),
Timepoint = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), levels = c("Timepoint_A", "Timepoint_B",
"Timepoint_C"), class = "factor"),
Score = c(11L, 12L, 13L, 14L, 21L, 22L, 23L, 24L, 51L, 52L, 53L, 54L)
)
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
dcast(dt_long, ID ~ Timepoint,
value.var = "Score")
Key: <ID>
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
33.1.3 dcast()
+ aggregate
If your ID ~ Timepoint
combination does not define a unique row in your input dataset, you need to specify an aggregate function.
For example, suppose you have four subjects with IDs “A”, “B”, “C”, “D” who had a couple variables measured 3 times in the AM and 3 times in the PM.
dt_long2 <- data.table(ID = rep(LETTERS[1:4], each = 6),
Timepoint = rep(c("AM", "PM"), length.out = 24, each = 3),
Var1 = rnorm(24, mean = 10),
Var2 = rnorm(24, mean = 20))
dt_long2[sample(24, size = 4), Var1 := NA]
dt_long2[sample(24, size = 4), Var2 := NA]
dt_long2
ID Timepoint Var1 Var2
<char> <char> <num> <num>
1: A AM 9.426754 NA
2: A AM NA 21.05159
3: A AM 10.400182 20.42153
4: A PM 9.118664 NA
5: A PM 9.250790 NA
6: A PM 10.006361 19.72336
7: B AM NA 20.47440
8: B AM 10.385733 NA
9: B AM 9.984771 20.43867
10: B PM 10.958410 20.23529
11: B PM 9.859273 20.80292
12: B PM 8.801945 21.38419
13: C AM 11.025027 21.17170
14: C AM 9.000606 19.95727
15: C AM 8.536722 21.65135
16: C PM 10.343971 19.52186
17: C PM NA 21.00053
18: C PM 8.843196 20.22614
19: D AM 10.468846 21.22194
20: D AM 10.767552 20.67233
21: D AM 8.688219 20.74750
22: D PM NA 20.35396
23: D PM 8.013317 19.34461
24: D PM 9.068445 19.42845
ID Timepoint Var1 Var2
If you wanted to convert the above data.table to wide format and get mean AM and PM values using the fun.aggregate
argument:
Key: <ID>
ID Var1_AM Var1_PM Var2_AM Var2_PM
<char> <num> <num> <num> <num>
1: A 9.913468 9.458605 20.73656 19.72336
2: B 10.185252 9.873209 20.45653 20.80747
3: C 9.520785 9.593584 20.92677 20.24951
4: D 9.974872 8.540881 20.88059 19.70901
You can apply multiple aggregating functions by passing a list to fun.aggregate
:
dcast(dt_long2,
ID ~ Timepoint,
value.var = c("Var1", "Var2"),
fun.aggregate = list(mean, max, min), na.rm = TRUE)
Key: <ID>
ID Var1_mean_AM Var1_mean_PM Var2_mean_AM Var2_mean_PM Var1_max_AM
<char> <num> <num> <num> <num> <num>
1: A 9.913468 9.458605 20.73656 19.72336 10.40018
2: B 10.185252 9.873209 20.45653 20.80747 10.38573
3: C 9.520785 9.593584 20.92677 20.24951 11.02503
4: D 9.974872 8.540881 20.88059 19.70901 10.76755
Var1_max_PM Var2_max_AM Var2_max_PM Var1_min_AM Var1_min_PM Var2_min_AM
<num> <num> <num> <num> <num> <num>
1: 10.006361 21.05159 19.72336 9.426754 9.118664 20.42153
2: 10.958410 20.47440 21.38419 9.984771 8.801945 20.43867
3: 10.343971 21.65135 21.00053 8.536722 8.843196 19.95727
4: 9.068445 21.22194 20.35396 8.688219 8.013317 20.67233
Var2_min_PM
<num>
1: 19.72336
2: 20.23529
3: 19.52186
4: 19.34461
Note how na.rm = TRUE
was successfully applied to all aggregating functions
33.2 Wide to long: melt()
33.2.1 Example 1
dt_wide <- data.table(
ID = 1:4,
Timepoint_A = 11:14,
Timepoint_B = 21:24,
Timepoint_C = 51:54)
dt_wide
ID Timepoint_A Timepoint_B Timepoint_C
<int> <int> <int> <int>
1: 1 11 21 51
2: 2 12 22 52
3: 3 13 23 53
4: 4 14 24 54
dt_long <- melt(
dt_wide,
id.vars = "ID",
measure.vars = 2:4, # defaults to all non-id columns
variable.name = "Timepoint",
value.name = c("Score"))
dt_long
ID Timepoint Score
<int> <fctr> <int>
1: 1 Timepoint_A 11
2: 2 Timepoint_A 12
3: 3 Timepoint_A 13
4: 4 Timepoint_A 14
5: 1 Timepoint_B 21
6: 2 Timepoint_B 22
7: 3 Timepoint_B 23
8: 4 Timepoint_B 24
9: 1 Timepoint_C 51
10: 2 Timepoint_C 52
11: 3 Timepoint_C 53
12: 4 Timepoint_C 54
33.2.2 Example 2
Using the same synthetic data as in Chapter 22:
dat_wide_dt <- data.table(
Account_ID = c(8001, 8002, 8003, 8004),
Age = rnorm(4, mean = 57, sd = 12),
RBC = rnorm(4, mean = 4.8, sd = .5),
WBC = rnorm(4, mean = 7250, sd = 1500),
Hematocrit = rnorm(4, mean = 40.2, sd = 4),
Hemoglobin = rnorm(4, mean = 13.6, sd = 1.5),
Admission = sample(c("ED", "Planned"), size = 4, replace = TRUE)
)
dat_wide_dt
Account_ID Age RBC WBC Hematocrit Hemoglobin Admission
<num> <num> <num> <num> <num> <num> <char>
1: 8001 45.36273 5.244354 8361.456 44.14094 13.95509 Planned
2: 8002 50.41712 4.878707 6750.612 38.02299 14.24148 Planned
3: 8003 53.44650 5.559841 7574.076 44.08299 10.54037 Planned
4: 8004 52.62272 5.300168 6317.071 38.10480 13.21905 Planned
dat_wide2long_dt <- melt(dat_wide_dt,
id.vars = c(1:2, 7),
measure.vars = 3:6,
variable.name = "Lab_key",
value.name = "Lab_value")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 45.36273 Planned RBC 5.244354
2: 8002 50.41712 Planned RBC 4.878707
3: 8003 53.44650 Planned RBC 5.559841
4: 8004 52.62272 Planned RBC 5.300168
5: 8001 45.36273 Planned WBC 8361.456332
6: 8002 50.41712 Planned WBC 6750.611609
7: 8003 53.44650 Planned WBC 7574.075643
8: 8004 52.62272 Planned WBC 6317.070986
9: 8001 45.36273 Planned Hematocrit 44.140940
10: 8002 50.41712 Planned Hematocrit 38.022995
11: 8003 53.44650 Planned Hematocrit 44.082992
12: 8004 52.62272 Planned Hematocrit 38.104797
13: 8001 45.36273 Planned Hemoglobin 13.955087
14: 8002 50.41712 Planned Hemoglobin 14.241485
15: 8003 53.44650 Planned Hemoglobin 10.540369
16: 8004 52.62272 Planned Hemoglobin 13.219047
If desired, you can set the ID column as they key, which will sort the data.table by its values:
setorder(dat_wide2long_dt, "Account_ID")
dat_wide2long_dt
Account_ID Age Admission Lab_key Lab_value
<num> <num> <char> <fctr> <num>
1: 8001 45.36273 Planned RBC 5.244354
2: 8001 45.36273 Planned WBC 8361.456332
3: 8001 45.36273 Planned Hematocrit 44.140940
4: 8001 45.36273 Planned Hemoglobin 13.955087
5: 8002 50.41712 Planned RBC 4.878707
6: 8002 50.41712 Planned WBC 6750.611609
7: 8002 50.41712 Planned Hematocrit 38.022995
8: 8002 50.41712 Planned Hemoglobin 14.241485
9: 8003 53.44650 Planned RBC 5.559841
10: 8003 53.44650 Planned WBC 7574.075643
11: 8003 53.44650 Planned Hematocrit 44.082992
12: 8003 53.44650 Planned Hemoglobin 10.540369
13: 8004 52.62272 Planned RBC 5.300168
14: 8004 52.62272 Planned WBC 6317.070986
15: 8004 52.62272 Planned Hematocrit 38.104797
16: 8004 52.62272 Planned Hemoglobin 13.219047