In this vignette we will see how to use the `mockDrugUtilisation()` function to create mock data.
This function is predominantly used in this package’s unit testing.
For example, one could use the default parameters to create a mock cdm reference like so:
cdm <- mockDrugUtilisation()
This will then populate several OMOP tables (for example, `person`, `concept` and `visit_occurrence`) and two cohorts in the cdm reference.
cdm$person |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ gender_concept_id <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth <int> 2008, 2000, 1970, 2003, 1956, 1986, 1986, 1983, 1…
#> $ day_of_birth <int> 5, 21, 26, 11, 20, 20, 13, 9, 11, 1
#> $ birth_datetime <date> 2008-12-05, 2000-11-21, 1970-11-26, 2003-02-11, 1…
#> $ race_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ location_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ provider_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ care_site_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ month_of_birth <int> 12, 11, 11, 2, 4, 1, 2, 12, 3, 5
cdm$person |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
cdm$concept |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 10
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ concept_id <dbl> 8505, 8507, 8532, 8576, 8587, 8718, 9202, 9551, 9655,…
#> $ concept_name <chr> "hour", "MALE", "FEMALE", "milligram", "milliliter", …
#> $ domain_id <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ vocabulary_id <chr> "UCUM", "Gender", "Gender", "UCUM", "UCUM", "UCUM", "…
#> $ concept_class_id <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ standard_concept <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", NA, "S",…
#> $ concept_code <chr> "h", "M", "F", "mg", "mL", "[iU]", "OP", "10*-3.eq", …
#> $ valid_start_date <chr> "01/01/1970", "01/01/1970", "01/01/1970", "01/01/1970…
#> $ valid_end_date <chr> "31/12/2099", "31/12/2099", "31/12/2099", "31/12/2099…
#> $ invalid_reason <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
cdm$concept |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 38
cdm$visit_occurrence |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id <int> 2, 2, 3, 3, 3, 3, 4, 6, 7, 8, 10, 1, 1, 2, 2, 3,…
#> $ visit_concept_id <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date <date> 2022-06-10, 2022-05-25, 1997-05-27, 1984-10-29,…
#> $ visit_end_date <date> 2022-06-11, 2022-05-28, 2000-04-19, 2001-03-20,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
cdm$visit_occurrence |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 47
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 3, 2, 1, 3, 3, 1, 3, 2, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2020-09-20, 2022-05-21, 2010-02-10, 2022-01-26, 2…
#> $ cohort_end_date <date> 2021-03-18, 2022-06-05, 2010-07-21, 2022-04-28, 2…
cdm$cohort1 |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
cdm$cohort2 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 1, 2, 1, 2, 3, 1, 3, 1, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2021-02-10, 2022-06-05, 1991-01-13, 2021-02-09, 2…
#> $ cohort_end_date <date> 2021-02-12, 2022-06-07, 2009-08-28, 2021-07-19, 2…
cdm$cohort2 |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
The user can also set the seed to control the randomness within the data.
We now observe that `cohort1` has been changed as a result of this seed:
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 2, 1, 2, 1, 1, 3, 1, 3, 2, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2018-06-14, 2019-04-10, 2020-01-28, 2010-07-09, 2…
#> $ cohort_end_date <date> 2018-08-10, 2019-11-19, 2020-02-02, 2015-04-24, 2…
Users can create mock data in two ways: one is to set the `numberIndividual` parameter, and the other is to customise the tables.
An example of the first approach is as follows:
cdm <- mockDrugUtilisation(numberIndividual = 100)
This will ensure that each of `person`, `observation_period`, `cohort1` and `cohort2` will have 100 rows.
cdm$person |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
#> $ gender_concept_id <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth <int> 1977, 1997, 1982, 1994, 1970, 1980, 1966, 1997, 2…
#> $ day_of_birth <int> 26, 22, 23, 22, 1, 13, 27, 10, 15, 21, 2, 12, 4, …
#> $ birth_datetime <date> 1977-04-26, 1997-12-22, 1982-04-23, 1994-08-22, …
#> $ race_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ location_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ provider_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ care_site_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ month_of_birth <int> 4, 12, 4, 8, 5, 3, 1, 2, 1, 10, 3, 5, 2, 12, 3, 4…
cdm$person |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 100
As a consequence of this, other tables such as `visit_occurrence`, `condition_occurrence` and `drug_strength` will have more rows than in the mock data produced using default settings.
cdm$visit_occurrence |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id <int> 1, 1, 1, 1, 2, 2, 2, 4, 4, 5, 5, 5, 6, 7, 7, 7, …
#> $ visit_concept_id <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date <date> 1989-11-05, 1988-01-17, 1986-10-19, 1996-12-22,…
#> $ visit_end_date <date> 2000-03-28, 1993-12-19, 1994-11-13, 1998-03-13,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
As we saw previously, the OMOP tables are automatically populated in `mockDrugUtilisation()`. However, the user can customise these tables. For example, to customise the `drug_exposure` table, one could do the following:
cdm <- mockDrugUtilisation(
drug_exposure = dplyr::tibble(
drug_exposure_id = 1:3,
person_id = c(1, 1, 1),
drug_concept_id = c(2, 3, 4),
drug_exposure_start_date = as.Date(c(
"2000-01-01", "2000-01-10", "2000-02-20"
)),
drug_exposure_end_date = as.Date(c(
"2000-02-10", "2000-03-01", "2000-02-20"
)),
quantity = c(41, 52, 1),
drug_type_concept_id = 0
)
)
cdm$drug_exposure |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 7
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ drug_exposure_id <int> 1, 2, 3
#> $ person_id <dbl> 1, 1, 1
#> $ drug_concept_id <dbl> 2, 3, 4
#> $ drug_exposure_start_date <date> 2000-01-01, 2000-01-10, 2000-02-20
#> $ drug_exposure_end_date <date> 2000-02-10, 2000-03-01, 2000-02-20
#> $ quantity <dbl> 41, 52, 1
#> $ drug_type_concept_id <dbl> 0, 0, 0
However, one needs to bear in mind that a customised OMOP table implicitly depends on other OMOP tables.
One could also modify other OMOP tables including `person`, `concept`, `concept_ancestor`, `drug_strength`, `observation_period`, `condition_occurrence`, `observation`, and `concept_relationship`.
In a similar fashion, cohort tables can also be customised.
cdm <- mockDrugUtilisation(
observation_period = dplyr::tibble(
observation_period_id = 1,
person_id = 1:2,
observation_period_start_date = as.Date("1900-01-01"),
observation_period_end_date = as.Date("2100-01-01"),
period_type_concept_id = 0
),
cohort1 = dplyr::tibble(
cohort_definition_id = 1,
subject_id = c(1, 1, 2),
cohort_start_date = as.Date(c("2000-01-01", "2001-01-01", "2000-01-01")),
cohort_end_date = as.Date(c("2000-03-01", "2001-03-01", "2000-03-01"))
)
)
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <dbl> 1, 1, 1
#> $ subject_id <dbl> 1, 1, 2
#> $ cohort_start_date <date> 2000-01-01, 2001-01-01, 2000-01-01
#> $ cohort_end_date <date> 2000-03-01, 2001-03-01, 2000-03-01