Create mock data to test DrugUtilisation package

library(DrugUtilisation)

Introduction

In this vignette we will see how to use the mockDrugUtilisation() function to create mock data. This function is predominantly used in this package’s unit testing.

For example, one could use the default parameters to create a mock cdm reference like so:

cdm <- mockDrugUtilisation()

This will then populate several omop tables (for example, person, concept and visit_occurrence) and two cohorts in the cdm reference.

cdm$person |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ gender_concept_id    <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth        <int> 2008, 2000, 1970, 2003, 1956, 1986, 1986, 1983, 1…
#> $ day_of_birth         <int> 5, 21, 26, 11, 20, 20, 13, 9, 11, 1
#> $ birth_datetime       <date> 2008-12-05, 2000-11-21, 1970-11-26, 2003-02-11, 1…
#> $ race_concept_id      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ location_id          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ provider_id          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ care_site_id         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ month_of_birth       <int> 12, 11, 11, 2, 4, 1, 2, 12, 3, 5

cdm$person |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1    10
cdm$concept |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 10
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ concept_id       <dbl> 8505, 8507, 8532, 8576, 8587, 8718, 9202, 9551, 9655,…
#> $ concept_name     <chr> "hour", "MALE", "FEMALE", "milligram", "milliliter", …
#> $ domain_id        <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ vocabulary_id    <chr> "UCUM", "Gender", "Gender", "UCUM", "UCUM", "UCUM", "…
#> $ concept_class_id <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ standard_concept <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", NA, "S",…
#> $ concept_code     <chr> "h", "M", "F", "mg", "mL", "[iU]", "OP", "10*-3.eq", …
#> $ valid_start_date <chr> "01/01/1970", "01/01/1970", "01/01/1970", "01/01/1970…
#> $ valid_end_date   <chr> "31/12/2099", "31/12/2099", "31/12/2099", "31/12/2099…
#> $ invalid_reason   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

cdm$concept |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1    38
cdm$visit_occurrence |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id             <int> 2, 2, 3, 3, 3, 3, 4, 6, 7, 8, 10, 1, 1, 2, 2, 3,…
#> $ visit_concept_id      <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date      <date> 2022-06-10, 2022-05-25, 1997-05-27, 1984-10-29,…
#> $ visit_end_date        <date> 2022-06-11, 2022-05-28, 2000-04-19, 2001-03-20,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

cdm$visit_occurrence |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1    47
cdm$cohort1 |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 3, 2, 1, 3, 3, 1, 3, 2, 1
#> $ subject_id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date    <date> 2020-09-20, 2022-05-21, 2010-02-10, 2022-01-26, 2…
#> $ cohort_end_date      <date> 2021-03-18, 2022-06-05, 2010-07-21, 2022-04-28, 2…

cdm$cohort1 |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1    10
cdm$cohort2 |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 1, 2, 1, 2, 3, 1, 3, 1, 1
#> $ subject_id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date    <date> 2021-02-10, 2022-06-05, 1991-01-13, 2021-02-09, 2…
#> $ cohort_end_date      <date> 2021-02-12, 2022-06-07, 2009-08-28, 2021-07-19, 2…

cdm$cohort2 |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1    10

Setting seeds

The user can also set the seed to control the randomness within the data.

cdm <- mockDrugUtilisation(
  seed = 789
)

We now observe that cohort1 has been changed as a result of this seed:

cdm$cohort1 |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 2, 1, 2, 1, 1, 3, 1, 3, 2, 1
#> $ subject_id           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date    <date> 2018-06-14, 2019-04-10, 2020-01-28, 2010-07-09, 2…
#> $ cohort_end_date      <date> 2018-08-10, 2019-11-19, 2020-02-02, 2015-04-24, 2…

Users can then create mock data in two ways: one is to set the numberIndividual parameter, and the other is to customise the tables.

Create mock data using the numberIndividual parameter

An example of use is as follows:

cdm <- mockDrugUtilisation(numberIndividual = 100)

This will ensure that each of person, observation_period, cohort1 and cohort2 will have 100 rows.

cdm$person |>
  dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
#> $ gender_concept_id    <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth        <int> 1977, 1997, 1982, 1994, 1970, 1980, 1966, 1997, 2…
#> $ day_of_birth         <int> 26, 22, 23, 22, 1, 13, 27, 10, 15, 21, 2, 12, 4, …
#> $ birth_datetime       <date> 1977-04-26, 1997-12-22, 1982-04-23, 1994-08-22, …
#> $ race_concept_id      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ location_id          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ provider_id          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ care_site_id         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ month_of_birth       <int> 4, 12, 4, 8, 5, 3, 1, 2, 1, 10, 3, 5, 2, 12, 3, 4…
cdm$person |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1   100

As a consequence, other tables such as visit_occurrence, condition_occurrence and drug_strength will have more rows than in the mock data produced using the default settings.

cdm$visit_occurrence |>
  dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id             <int> 1, 1, 1, 1, 2, 2, 2, 4, 4, 5, 5, 5, 6, 7, 7, 7, …
#> $ visit_concept_id      <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date      <date> 1989-11-05, 1988-01-17, 1986-10-19, 1996-12-22,…
#> $ visit_end_date        <date> 2000-03-28, 1993-12-19, 1994-11-13, 1998-03-13,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
cdm$visit_occurrence |>
  dplyr::tally()
#> # Source:   SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#>       n
#>   <dbl>
#> 1   507

Create mock data by customising tables

Customise omop tables

As we saw previously, the omop tables are automatically populated in mockDrugUtilisation(). However, the user can customise these tables. For example, to customise the drug_exposure table, one could do the following:

cdm <- mockDrugUtilisation(
    drug_exposure = dplyr::tibble(
      drug_exposure_id = 1:3,
      person_id = c(1, 1, 1),
      drug_concept_id = c(2, 3, 4),
      drug_exposure_start_date = as.Date(c(
        "2000-01-01", "2000-01-10", "2000-02-20"
      )),
      drug_exposure_end_date = as.Date(c(
        "2000-02-10", "2000-03-01", "2000-02-20"
      )),
      quantity = c(41, 52, 1),
      drug_type_concept_id = 0
    )
  )
cdm$drug_exposure |>
  dplyr::glimpse()
#> Rows: ??
#> Columns: 7
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ drug_exposure_id         <int> 1, 2, 3
#> $ person_id                <dbl> 1, 1, 1
#> $ drug_concept_id          <dbl> 2, 3, 4
#> $ drug_exposure_start_date <date> 2000-01-01, 2000-01-10, 2000-02-20
#> $ drug_exposure_end_date   <date> 2000-02-10, 2000-03-01, 2000-02-20
#> $ quantity                 <dbl> 41, 52, 1
#> $ drug_type_concept_id     <dbl> 0, 0, 0

However, one needs to bear in mind that a customised omop table implicitly depends on other omop tables, so the customised values must remain consistent with them.

One could also modify other omop tables including person, concept, concept_ancestor, drug_strength, observation_period, condition_occurrence, observation, and concept_relationship.

Customise cohorts

In a similar fashion, cohort tables can also be customised.

cdm <- mockDrugUtilisation(
  observation_period = dplyr::tibble(
      observation_period_id = 1,
      person_id = 1:2,
      observation_period_start_date = as.Date("1900-01-01"),
      observation_period_end_date = as.Date("2100-01-01"),
      period_type_concept_id = 0
    ),
    cohort1 = dplyr::tibble(
      cohort_definition_id = 1,
      subject_id = c(1, 1, 2),
      cohort_start_date = as.Date(c("2000-01-01", "2001-01-01", "2000-01-01")),
      cohort_end_date = as.Date(c("2000-03-01", "2001-03-01", "2000-03-01"))
    )
  )
cdm$cohort1 |> 
  dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <dbl> 1, 1, 1
#> $ subject_id           <dbl> 1, 1, 2
#> $ cohort_start_date    <date> 2000-01-01, 2001-01-01, 2000-01-01
#> $ cohort_end_date      <date> 2000-03-01, 2001-03-01, 2000-03-01