Split mixed datasets — split_mixed

Split mixed tables, i.e. tables that hold both long data (N values per patient) and short data (one value per patient, duplicated on N lines), into one long table and one short table.

Usage

split_mixed_datasets(
  datasets = get_datasets(),
  id = get_subjid_cols(),
  ...,
  ignore_cols = get_meta_cols(0.95),
  output_code = FALSE,
  verbose = TRUE
)

Arguments

datasets: a dataframe or a list of dataframes to split. Default to all the datasets from .lookup.
id: the patient identifier, probably "subjid". Should be shared by all datasets. Case-insensitive.
...: not used
ignore_cols: columns to ignore when considering a table as long. Default to getOption("edc_cols_crfname", "CRFNAME"). Case-insensitive.
output_code: whether to print the code to explicitly write. Can also be a file path.
verbose: whether to print informations about the process.

Value

a list of the new long and short tables. Use load_list() to load them into the global environment.

Examples

#tm = read_trialmaster("filename.zip", pw="xx")
tm = edc_example()
#> Warning: Option "edc_lookup" has been overwritten.
names(tm)
#>  [1] "enrol"               "db1"                 "db2"                
#>  [4] "db3"                 "short"               "long_pure"          
#>  [7] "long_mixed"          "ae"                  "datetime_extraction"
#> [10] "date_extraction"     ".lookup"            
#load_list(tm)
print(tm$long_mixed) #`val1` and `val2` are long but `val3` is short
#> # A tibble: 100 × 5
#>    subjid crfname                     val1b val2b val3b
#>     <int> <chr>                       <dbl> <dbl> <chr>
#>  1      1 both short and long data  1.33    11.0  B    
#>  2      1 both short and long data -0.869   10.9  B    
#>  3      2 both short and long data  0.0555  10.0  C    
#>  4      2 both short and long data  0.0491  10.1  C    
#>  5      3 both short and long data -0.578    9.28 D    
#>  6      3 both short and long data -0.999    9.80 D    
#>  7      4 both short and long data -0.00243  8.97 E    
#>  8      4 both short and long data  0.656    9.03 E    
#>  9      5 both short and long data  1.48     8.78 F    
#> 10      5 both short and long data -1.91    10.8  F    
#> # ℹ 90 more rows

mixed_data = split_mixed_datasets(tm, id="subjid", verbose=TRUE)
#> ✔ There were 5 short tables:
#>   "enrol", "db1", "db2", "db3", and "short"
#> ✔ There was 1 pure long table:
#>   "long_pure"
#> ✔ There were 2 mixed (short+long) tables:
#>   "long_mixed" and "ae"
#> → Use `EDCimport::load_list()` on the result to get separated long and short
#>   data.
load_list(mixed_data)
print(long_mixed_short) 
#> # A tibble: 50 × 3
#>    subjid crfname                  val3b
#>     <int> <chr>                    <chr>
#>  1      1 both short and long data B    
#>  2      2 both short and long data C    
#>  3      3 both short and long data D    
#>  4      4 both short and long data E    
#>  5      5 both short and long data F    
#>  6      6 both short and long data G    
#>  7      7 both short and long data H    
#>  8      8 both short and long data I    
#>  9      9 both short and long data J    
#> 10     10 both short and long data K    
#> # ℹ 40 more rows
print(long_mixed_long) 
#> # A tibble: 100 × 3
#>    subjid    val1b val2b
#>     <int>    <dbl> <dbl>
#>  1      1  1.33    11.0 
#>  2      1 -0.869   10.9 
#>  3      2  0.0555  10.0 
#>  4      2  0.0491  10.1 
#>  5      3 -0.578    9.28
#>  6      3 -0.999    9.80
#>  7      4 -0.00243  8.97
#>  8      4  0.656    9.03
#>  9      5  1.48     8.78
#> 10      5 -1.91    10.8 
#> # ℹ 90 more rows

#alternatively, get the code and only use the datasets you need
split_mixed_datasets(tm, id="subjid", output_code=TRUE)
#> ✔ There were 5 short tables:
#>   "enrol", "db1", "db2", "db3", and "short"
#> ✔ There was 1 pure long table:
#>   "long_pure"
#> ✔ There were 2 mixed (short+long) tables:
#>   "long_mixed" and "ae"
#> → Copy the following code in your script to separate long and short data:
#> ## `long_mixed` (dim=100x5) ---- 
#> 
#> long_mixed_short = long_mixed %>% 
#>   select(subjid, crfname, val3b) %>% 
#>   group_by(subjid) %>% 
#>   summarise(across(everything(), unify)) #dim=50x3 
#> 
#>  long_mixed_long = long_mixed %>% 
#>   select(subjid, val1b, val2b) #dim=100x3
#> 
#> 
#> ## `ae` (dim=175x6) ---- 
#> 
#> ae_short = ae %>% 
#>   select(subjid, crfname, n_ae) %>% 
#>   group_by(subjid) %>% 
#>   summarise(across(everything(), unify)) #dim=48x3 
#> 
#>  ae_long = ae %>% 
#>   select(subjid, aesoc, aegr, sae) #dim=175x4 
filename = tempfile("mixed_code", fileext=".R")
split_mixed_datasets(tm, id="subjid", output_code=filename)
#> ✔ There were 5 short tables:
#>   "enrol", "db1", "db2", "db3", and "short"
#> ✔ There was 1 pure long table:
#>   "long_pure"
#> ✔ There were 2 mixed (short+long) tables:
#>   "long_mixed" and "ae"
#> → Copy the code from /tmp/RtmpvVIOBV/mixed_code1fd86a67712c.R in your script to
#>   separate long and short data:
#>   `utils::browseURL(/tmp/RtmpvVIOBV/mixed_code1fd86a67712c.R)`
readLines(filename)
#>  [1] "## `long_mixed` (dim=100x5) ---- "                  
#>  [2] ""                                                   
#>  [3] "long_mixed_short = long_mixed %>% "                 
#>  [4] "  select(subjid, crfname, val3b) %>% "              
#>  [5] "  group_by(subjid) %>% "                            
#>  [6] "  summarise(across(everything(), unify)) #dim=50x3 "
#>  [7] ""                                                   
#>  [8] " long_mixed_long = long_mixed %>% "                 
#>  [9] "  select(subjid, val1b, val2b) #dim=100x3"          
#> [10] ""                                                   
#> [11] ""                                                   
#> [12] "## `ae` (dim=175x6) ---- "                          
#> [13] ""                                                   
#> [14] "ae_short = ae %>% "                                 
#> [15] "  select(subjid, crfname, n_ae) %>% "               
#> [16] "  group_by(subjid) %>% "                            
#> [17] "  summarise(across(everything(), unify)) #dim=48x3 "
#> [18] ""                                                   
#> [19] " ae_long = ae %>% "                                 
#> [20] "  select(subjid, aesoc, aegr, sae) #dim=175x4 "