## Create clean data dictionary
##
## Imports the cleaned Bentiu 2022 measles vaccination coverage survey
## dataset, builds a datadict-style data dictionary from it, flags derived
## variables, documents coded-list choices, withholds indirect identifiers
## (pseudonymisation), validates dictionary and data against each other and
## exports the shareable versions of both.

# Activate packages -------------------------------------------------------
pacman::p_load(
  "here",
  "rio",
  "tidyverse",
  "janitor"
)

# NOTE(review): pacman::p_load_gh() below is also asked to load/install the
# same repository; confirm whether this explicit install is still required.
remotes::install_github("epicentre-msf/datadict")

# Load/install packages from GitHub
pacman::p_load_gh(
  # data dictionary
  "epicentre-msf/datadict" # create/validate data dictionary
)

# Import data -------------------------------------------------------------
## Import the clean dataset
dat <- rio::import(
  here(
    "15_Vaccination_Survey", "2022_Bentiu_Measles", "4_Data", "clean",
    "bentiu_measles_vcs_2022_clean.xlsx"
  ),
  setclass = "tbl"
)

## Data cleaning for dataset
dat <- dat %>%
  ## the date column is converted from an Excel serial number to a Date
  mutate(date = excel_numeric_to_date(as.numeric(date))) %>%
  ## remove the id and uuid variables created by kobo
  select(-c("id", "uuid"))

# Create data dictionary --------------------------------------------------
bentiu_vcs_clean_dict <- dict_from_data(dat)

# Data cleaning for data dictionary ---------------------------------------
## Variables whose dictionary type must be overridden to "Coded list"
coded_list_vars <- c(
  "block_name",
  "vaccination_status",
  "vaccination_status_simple",
  "vaccination_status_msf",
  "vaccination_status_simple_msf",
  "vaccination_status_rout",
  "vaccination_status_simple_rout",
  "diagnosis_disease"
)

## Clean up numeric, coded list and date variables
## NOTE(review): the dataset column converted above is `date`; confirm that a
## `date_of_survey` variable exists, otherwise that branch never matches.
bentiu_vcs_clean_dict <- bentiu_vcs_clean_dict %>%
  mutate(type = case_when(
    variable_name == "random_child" ~ "Numeric",
    variable_name %in% coded_list_vars ~ "Coded list",
    variable_name == "date_of_survey" ~ "Date",
    .default = type
  ))

## Variables that were computed during cleaning rather than collected
derived_vars <- c(
  "age_group",
  "age_group_mon",
  "age_category",
  "vaccination_status",
  "vaccination_status_simple",
  "vaccination_status_msf",
  "vaccination_status_simple_msf",
  "vaccination_status_rout",
  "vaccination_status_simple_rout",
  "diagnosis_disease",
  "sector_block",
  "weight_simple"
)

## Mark derived variables in the dictionary. Vectorised sub-assignment is
## used instead of a row-wise loop so that a zero-row dictionary is a no-op.
is_derived <- bentiu_vcs_clean_dict$variable_name %in% derived_vars
bentiu_vcs_clean_dict$origin[is_derived] <- "derived"

## Add values for block name and additional variables
bentiu_vcs_clean_dict <- bentiu_vcs_clean_dict %>%
  mutate(choices = case_when(
    variable_name == "block_name" ~
      "0, Block 1 | 1, Block 2 | 2, Block 3 | 3, Block 4 | 4, Block 5 | 5, Block 6 | 6, Block 7 | 7, Block 8 | 8, Block 9 | 9, Block 10 | 10, Block 11 | 11, Block 12 | 12, Block 13 | 13, Block 14 | 14, Block 15 | 15, Block 16",
    variable_name %in% c(
      "vaccination_status_simple",
      "vaccination_status_simple_msf",
      "vaccination_status_simple_rout",
      "diagnosis_disease"
    ) ~ "0, TRUE | 1, FALSE",
    variable_name == "age_category" ~
      "0, 0-5 months | 1, 12-59 months | 2, 5-14 years | 3, 6-8 months | 4, 9-12 months ",
    .default = choices
  ))

# Pseudonymisation process ------------------------------------------------
## Exact ages are withheld up front
vars_withhold <- c(
  "age_months",
  "age_years"
)

## Create a copy of the dictionary
dict <- bentiu_vcs_clean_dict
dict$status[dict$variable_name %in% vars_withhold] <- "withheld"

## Assess re-identification risk criterion using all indirect identifiers
vars_indirect <- c(
  "sex",
  "age_group",
  "age_group_mon",
  "age_category",
  "sector_name",
  "block_name"
)

test <- datadict::k_anonymity_counts(dat, vars_indirect, threshold = 5)

## Check other combinations of variables to see if more need to be withheld
datadict::k_anonymity_counts(dat, c("sex", "age_group"), threshold = 5)
datadict::k_anonymity_counts(dat, c("sex", "age_group_mon"), threshold = 5)
datadict::k_anonymity_counts(dat, c("sex", "age_category"), threshold = 5)
datadict::k_anonymity_counts(
  dat, c("sex", "age_group", "sector_name"),
  threshold = 5
)
datadict::k_anonymity_counts(
  dat, c("sex", "age_group", "block_name"),
  threshold = 5
)

## Sector and block name variables make individuals more directly
## identifiable and need to be withheld. The list is accumulated (not
## overwritten) so that one vector drives both the dictionary status and the
## blanking of the dataset columns below.
vars_withhold <- c(
  vars_withhold,
  "sector_name",
  "block_name",
  "age_category",
  "age_group_mon"
)

dict$status[dict$variable_name %in% vars_withhold] <- "withheld"

# Remove withheld variables from dataset ----------------------------------
## Columns are kept but emptied, exactly the set flagged "withheld" above
dat[vars_withhold] <- NA

## Validate the dictionary on its own, then the dataset against it
datadict::valid_dict(dict)
datadict::valid_data(dat, dict)

### Export cleaned dictionary and dataset
rio::export(
  dat,
  here(
    "15_Vaccination_Survey", "2022_Bentiu_Measles", "4_Data", "clean",
    "bentiu_measles_data_vcs_2022_clean_SHARE.xlsx"
  )
)

rio::export(
  dict,
  here(
    "15_Vaccination_Survey", "2022_Bentiu_Measles", "4_Data", "clean",
    "bentiu_measles_dict_vcs_2022_clean_SHARE.xlsx"
  )
)