CodelistGenerator

Working with the OMOP CDM vocabulary tables

2025-06-29

CodelistGenerator

Reference to the CDM vocabulary tables

library(duckdb)
library(CDMConnector)
library(dplyr)
library(tidyr)
library(DBI)
library(here)

# Name of the example dataset; reused below for the DuckDB file name and as
# the CDM name.
datasetName <- "GiBleed"
# Path to "<datasetName>.duckdb", resolved with here().
dbdir <- here(paste0(datasetName, ".duckdb"))
# Open a DBI connection to that DuckDB database file.
con <- dbConnect(drv = duckdb(dbdir = dbdir))

# Build the CDM reference from the connection. Both the CDM schema and the
# write schema are "main".
# NOTE(review): writePrefix = "cg_" presumably prefixes any tables written to
# the write schema -- confirm against the CDMConnector documentation.
cdm <- cdmFromCon(
  con = con,
  cdmSchema = "main",
  writeSchema = "main",
  writePrefix = "cg_",
  cdmName = datasetName
)

cdm

── # OMOP CDM reference (duckdb) of GiBleed ──────────────────────────────────────────────────────────────────────────────────────

• omop tables: person, observation_period, visit_occurrence, visit_detail, condition_occurrence, drug_exposure,
procedure_occurrence, device_exposure, measurement, observation, death, note, note_nlp, specimen, fact_relationship, location,
care_site, provider, payer_plan_period, cost, drug_era, dose_era, condition_era, metadata, cdm_source, concept, vocabulary,
domain, concept_class, concept_relationship, relationship, concept_synonym, concept_ancestor, source_to_concept_map,
drug_strength

• cohort tables: -

• achilles tables: -

• other tables: -

Reference to the CDM vocabulary tables

Note: Eunomia doesn’t have a full set of vocabularies:

cdm$concept %>%
  tally() %>%
  pull()

[1] 444

We’ll create a mock vocabulary reference to demonstrate the functions that won’t work with Eunomia because of its partial vocabularies.

library(CodelistGenerator)
cdm_mock <- mockVocabRef()
cdm_mock

CDM vocabulary tables

https://athena.ohdsi.org

CDM vocabulary tables

cdm$concept %>% glimpse()

Rows: ??
Columns: 10
Database: DuckDB v1.3.1 [unknown@Linux 6.11.0-1015-azure:R 4.5.1//home/runner/work/RealWorldEvidenceSummerSchool2025/RealWorldEvidenceSummerSchool2025/GiBleed.duckdb]
$ concept_id       <int> 35208414, 1118088, 40213201, 1557272, 4336464, 4295880, 3020630, 19129655, 44923712, 1569708, 40213216,…
$ concept_name     <chr> "Gastrointestinal hemorrhage, unspecified", "celecoxib 200 MG Oral Capsule [Celebrex]", "pneumococcal p…
$ domain_id        <chr> "Condition", "Drug", "Drug", "Drug", "Procedure", "Procedure", "Measurement", "Drug", "Drug", "Conditio…
$ vocabulary_id    <chr> "ICD10CM", "RxNorm", "CVX", "RxNorm", "SNOMED", "SNOMED", "LOINC", "RxNorm", "NDC", "ICD10CM", "CVX", "…
$ concept_class_id <chr> "4-char billing code", "Branded Drug", "CVX", "Ingredient", "Procedure", "Procedure", "Lab Test", "Clin…
$ standard_concept <chr> NA, "S", "S", "S", "S", "S", "S", "S", NA, NA, "S", "S", "S", "S", "S", "S", NA, "S", "S", "S", "S", "S…
$ concept_code     <chr> "K92.2", "213469", "33", "46041", "232717009", "76601001", "2885-2", "789980", "00025152531", "K92", "1…
$ valid_start_date <date> 2007-01-01, 1970-01-01, 2008-12-01, 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 2008-03-30, 2000-0…
$ valid_end_date   <date> 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-1…
$ invalid_reason   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

CDM vocabulary tables

cdm$condition_occurrence %>%
  group_by(condition_concept_id) %>%
  tally() %>%
  left_join(
    cdm$concept %>%
      select("concept_id", "concept_name"),
    by = c("condition_concept_id" = "concept_id")
  ) %>%
  collect() %>%
  arrange(desc(n))

# A tibble: 80 × 3
   condition_concept_id     n concept_name                            
                  <int> <dbl> <chr>                                   
 1             40481087 17268 Viral sinusitis                         
 2              4112343 10217 Acute viral pharyngitis                 
 3               260139  8184 Acute bronchitis                        
 4               372328  3605 Otitis media                            
 5                80180  2694 Osteoarthritis                          
 6                28060  2656 Streptococcal sore throat               
 7                81151  1915 Sprain of ankle                         
 8               378001  1013 Concussion with no loss of consciousness
 9              4283893  1001 Sinusitis                               
10              4294548   939 Acute bacterial sinusitis               
# ℹ 70 more rows

CDM vocabulary tables

cdm$concept_ancestor %>% glimpse()

Rows: ??
Columns: 4
Database: DuckDB v1.3.1 [unknown@Linux 6.11.0-1015-azure:R 4.5.1//home/runner/work/RealWorldEvidenceSummerSchool2025/RealWorldEvidenceSummerSchool2025/GiBleed.duckdb]
$ ancestor_concept_id      <int> 4180628, 4179141, 21500574, 21505770, 21503967, 36203060, 36151386, 21502552, 40765628, 4433964…
$ descendant_concept_id    <int> 313217, 4146173, 1118084, 1119510, 40162522, 40479422, 1119510, 1112807, 40769189, 19128009, 15…
$ min_levels_of_separation <int> 5, 2, 4, 0, 5, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,…
$ max_levels_of_separation <int> 6, 2, 4, 0, 6, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0,…

CDM vocabulary tables

cdm$concept_relationship %>% glimpse()

Rows: ??
Columns: 6
Database: DuckDB v1.3.1 [unknown@Linux 6.11.0-1015-azure:R 4.5.1//home/runner/work/RealWorldEvidenceSummerSchool2025/RealWorldEvidenceSummerSchool2025/GiBleed.duckdb]
$ concept_id_1     <int> 192671, 1118088, 1569708, 35208414, 35208414, 40162359, 44923712, 45011828
$ concept_id_2     <int> 35208414, 44923712, 35208414, 192671, 1569708, 45011828, 1118088, 40162359
$ relationship_id  <chr> "Mapped from", "Mapped from", "Subsumes", "Maps to", "Is a", "Mapped from", "Maps to", "Maps to"
$ valid_start_date <date> 1970-01-01, 1970-01-01, 2016-03-25, 1970-01-01, 2016-03-25, 2009-08-03, 1970-01-01, 2009-08-03
$ valid_end_date   <date> 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31, 2099-12-31
$ invalid_reason   <chr> NA, NA, NA, NA, NA, NA, NA, NA

CDM vocabulary tables

cdm$concept_synonym %>% glimpse()

Rows: ??
Columns: 3
Database: DuckDB v1.3.1 [unknown@Linux 6.11.0-1015-azure:R 4.5.1//home/runner/work/RealWorldEvidenceSummerSchool2025/RealWorldEvidenceSummerSchool2025/GiBleed.duckdb]
$ concept_id           <int> 964261, 1322184, 441267, 1718412, 4336464, 4102123, 4237458, 4280726, 4330583, 3014576, 4242997, 30…
$ concept_synonym_name <chr> "cyanocobalamin 5000 MCG/ML Injectable Solution", "clopidogrel", "Cystic fibrosis (disorder)", "Kyl…
$ language_concept_id  <int> 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, 4180186, …

Exploring vocabulary tables using CodelistGenerator

Vocabulary version

Search results will be specific to the version of the vocabulary being used

getVocabVersion(cdm = cdm)

[1] "v5.0 18-JAN-19"

Available vocabularies

What vocabularies are available?

getVocabularies(cdm = cdm)

[1] "CVX"     "Gender"  "ICD10CM" "LOINC"   "NDC"     "None"    "RxNorm"  "SNOMED"  "Visit"

Available domains

What domains are present?

getDomains(cdm = cdm)

[1] "Drug"        "Measurement" "Procedure"   "Condition"   "Observation" "Visit"       "Gender"

Concept classes

What concept classes are present?

getConceptClassId(
  cdm = cdm,
  standardConcept = "Standard",
  domain = "Drug"
)

[1] "Branded Drug"        "Branded Drug Comp"   "Branded Pack"        "Clinical Drug"       "Clinical Drug Comp" 
[6] "CVX"                 "Ingredient"          "Quant Branded Drug"  "Quant Clinical Drug"

getConceptClassId(
  cdm = cdm,
  standardConcept = "Standard",
  domain = "Condition"
)

[1] "Clinical Finding"

Relationship ID

What relationships do we have between standard concepts?

getRelationshipId(
  cdm = cdm_mock,
  standardConcept1 = c("standard"),
  standardConcept2 = c("standard"),
  domains1 = "condition",
  domains2 = "condition"
)

[1] "Due to of"

What relationships do we have between standard and non-standard concepts?

getRelationshipId(
  cdm = cdm_mock,
  standardConcept1 = c("standard"),
  standardConcept2 = c("non-standard"),
  domains1 = "condition",
  domains2 = "condition"
)

[1] "Mapped from"

Drug dose forms

getDoseForm(cdm = cdm_mock)

[1] "Injectable"            "Injection"             "Nasal Powder"          "Topical Liquefied Gas"

Your turn

Using a cdm reference you have connected to:

What is the vocabulary version of the cdm?
How many concepts are in your concept table? How many of these are standard concepts?
What domains are available? Which domains would you use if you were defining a cohort of people with asthma?

Solution

💡 Click to see solution

# Vocabulary version of the cdm
getVocabVersion(cdm = cdm)

# Total number of concepts in the concept table
cdm$concept |>
  tally()

# Number of standard concepts (standard_concept is "S" for standard concepts,
# as shown in the glimpse of the concept table)
cdm$concept |>
  filter(standard_concept == "S") |>
  tally()

# Domains available in the vocabulary
getDomains(cdm = cdm)

Vocabulary-based codelists using CodelistGenerator

Vocabulary-based codelists using CodelistGenerator

We can use drug hierarchies and relationships to create vocabulary-based codelists.

Drug ingredients

ingredients <- getDrugIngredientCodes(cdm = cdm, nameStyle = "{concept_name}")
ingredients


- acetaminophen (7 codes)
- albuterol (2 codes)
- alendronate (2 codes)
- alfentanil (1 codes)
- alteplase (2 codes)
- amiodarone (2 codes)
along with 85 more codelists

ingredients$warfarin

[1]  1310149 40163554

cdm$concept |>
  filter(concept_id %in% c(1310149, 40163554))

# Source:   SQL [?? x 10]
# Database: DuckDB v1.3.1 [unknown@Linux 6.11.0-1015-azure:R 4.5.1//home/runner/work/RealWorldEvidenceSummerSchool2025/RealWorldEvidenceSummerSchool2025/GiBleed.duckdb]
  concept_id concept_name   domain_id vocabulary_id concept_class_id standard_concept concept_code valid_start_date valid_end_date
       <int> <chr>          <chr>     <chr>         <chr>            <chr>            <chr>        <date>           <date>        
1    1310149 Warfarin       Drug      RxNorm        Ingredient       S                11289        1970-01-01       2099-12-31    
2   40163554 Warfarin Sodi… Drug      RxNorm        Clinical Drug    S                855332       2009-08-02       2099-12-31    
# ℹ 1 more variable: invalid_reason <chr>

ATC classifications

atc <- getATCCodes(cdm = cdm_mock, nameStyle = "{concept_name}")
atc


- alimentary_tract_and_metabolism (2 codes)

atc$alimentary_tract_and_metabolism

[1] 12 13

ICD10 chapters

icd <- getICD10StandardCodes(cdm = cdm_mock, nameStyle = "{concept_name}")
icd


- arthropathies (3 codes)
- diseases_of_the_musculoskeletal_system_and_connective_tissue (3 codes)

icd$arthropathies

[1] 3 4 5

Your turn

Using Eunomia data:

Get codes for memantine using getDrugIngredientCodes. How many codes do you include?
How many records for memantine are in the drug exposure table (hint: filter on the drug_concept_id field from the drug_exposure table)?
- 0
- 67
- 110
- 245

Solution

💡 Click to see solution

memantine_codes <- getDrugIngredientCodes(cdm = cdm, name = "memantine")
memantine_codes

cdm$drug_exposure |>
  filter(drug_concept_id %in% !!memantine_codes[[1]]) |>
  tally()

Systematic search using CodelistGenerator

CodelistGenerator is used to create a candidate set of codes for helping to define patient cohorts in data mapped to the OMOP common data model.

A little like the process for a systematic review, the idea is that for a specified search strategy, CodelistGenerator will identify a set of concepts that may be relevant, with these then being screened to remove any irrelevant codes.

Codes for asthma

asthma_codes <- getCandidateCodes(
  cdm = cdm,
  keywords = "asthma",
  domains = "Condition"
)
asthma_codes %>% glimpse()

Rows: 2
Columns: 6
$ concept_id       <int> 4051466, 317009
$ found_from       <chr> "From initial search", "From initial search"
$ concept_name     <chr> "Childhood asthma", "Asthma"
$ domain_id        <chr> "Condition", "Condition"
$ vocabulary_id    <chr> "SNOMED", "SNOMED"
$ standard_concept <chr> "S", "S"

asthma_cs <- newCodelist(list("asthma" = asthma_codes$concept_id))
asthma_cs


- asthma (2 codes)

Your turn

Using Eunomia data:

Search for codes for sinusitis recorded in the condition domain
Do you identify any more codes if you search in the observation domain as well as the condition domain?

Solution

💡 Click to see solution

sinusitis_codes <- getCandidateCodes(
  cdm = cdm,
  keywords = "sinusitis",
  domains = "Condition"
)
nrow(sinusitis_codes)

sinusitis_codes_2 <- getCandidateCodes(
  cdm = cdm,
  keywords = "sinusitis",
  domains = c("Condition", "Observation")
)
nrow(sinusitis_codes_2)

Codelist diagnostics

Code counts

library(omopgenerics)
asthma_code_use <- summariseCodeUse(
  x = asthma_cs,
  byYear = TRUE,
  bySex = TRUE,
  ageGroup = list(
    c(0, 17),
    c(18, 65),
    c(66, 150)
  ),
  cdm = cdm
) |>
  suppress(minCellCount = 5)

tableCodeUse(result = asthma_code_use |>
  filter(strata_name == "overall"))

										Database name
										GiBleed
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Estimate name
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Record count	Person count
asthma	overall	overall	overall	overall	-	NA	NA	NA	NA	101	101
				Asthma	317009	Asthma	317009	195967001	condition	5	5
				Childhood asthma	4051466	Childhood asthma	4051466	233678006	condition	96	96

Code counts

tableCodeUse(result = asthma_code_use |>
  filter(
    strata_name == "year",
    strata_level %in% c("2004", "2005", "2006")
  ))

										Database name
										GiBleed
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Estimate name
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Record count	Person count
asthma	2004	overall	overall	overall	-	NA	NA	NA	NA	<5	<5
	2006	overall	overall	overall	-	NA	NA	NA	NA	<5	<5
	2004	overall	overall	Asthma	317009	Asthma	317009	195967001	condition	<5	<5
	2006	overall	overall	Asthma	317009	Asthma	317009	195967001	condition	<5	<5

Code counts

tableCodeUse(result = asthma_code_use |>
  filter(strata_name == "age_group"))

										Database name
										GiBleed
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Estimate name
Codelist name	Year	Sex	Age group	Standard concept name	Standard concept ID	Source concept name	Source concept ID	Source concept value	Domain ID	Record count	Person count
asthma	overall	overall	0 to 17	overall	-	NA	NA	NA	NA	96	96
			18 to 65	overall	-	NA	NA	NA	NA	5	5
				Asthma	317009	Asthma	317009	195967001	condition	5	5
			0 to 17	Childhood asthma	4051466	Childhood asthma	4051466	233678006	condition	96	96

Your turn

Using Eunomia data:

Identify codes for appendicitis from the condition domain
Make a table with counts of the usage of these codes

Solution

💡 Click to see solution

# Search the condition domain for candidate appendicitis codes
appendicitis_codes <- getCandidateCodes(
  cdm = cdm,
  keywords = "appendicitis",
  domains = "Condition"
)

# Wrap the concept IDs in a named codelist, as done for asthma
appendicitis_cs <- newCodelist(
  list("appendicitis" = appendicitis_codes$concept_id)
)

# Summarise how often these codes are used in the cdm; tableCodeUse() expects
# this summarised result, not the candidate codes themselves
appendicitis_code_use <- summariseCodeUse(
  x = appendicitis_cs,
  cdm = cdm
)

tableCodeUse(result = appendicitis_code_use)

CodelistGenerator

👉 Packages website
👉 CRAN link
👉 Manual

📧 edward.burn@ndorms.ox.ac.uk