Add log feature

Add ability to print full log of all duawranglr commands

TO DO: write template function

Write a function that produces a template script. Have function ask a few questions to set options (i.e., should the data be deidentified). Script should be saved with robust comments.

TO DO: Better README

Fill out README.md with basic examples.

TO DO: move messages to common file

Move messages to common file so that repeated messages have one source.

Add a sanitization suggestion tool

Feature Request

When I need to review large data sets with many tables and columns it would be helpful if there was a function that would suggest the columns and tables that may contain likely PII content. Several methods of doing this are in the following script (this is a get'r done script, and would need to be re-built for this purpose, to suggest, instead of sanitize everything)

# License for this script: [CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/)
library(tidyverse) #data wrangling
library(data.table) #data wrangling
library(dplyr) #data wrangling
library(lubridate) #handle dates
library(stringr)# https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_strings.pdf
library(loggit) # for simple logging
library(openssl) # # Used to generate hash values
### Generic functions !!! ###

.get.random.character <- function(...,percentUpper=.4,percentLower=.4,percentNumber=.1, percentSymbol=.1){
  # Should never return "_"
  percentUpper <- percentUpper /(percentUpper + percentLower + percentNumber + percentSymbol)
  percentLower <- percentLower /(percentUpper + percentLower + percentNumber + percentSymbol)
  percentNumber <- percentNumber /(percentUpper + percentLower + percentNumber + percentSymbol)
  percentSymbol <- percentSymbol /(percentUpper + percentLower + percentNumber + percentSymbol)
  # Using the restricted symbols set used by the LASTPASS random password generator
  SYMBOLS <- c("!","&","%","#","$","@","*","^")
  .randomNumber <- rand_num(n=1)
  ifelse(
    .randomNumber < percentUpper,
    LETTERS[as.integer(rand_num(n=1)*26+1)],
    ifelse(
      .randomNumber < (percentUpper+percentLower),
      letters[as.integer(rand_num(n=1)*26+1)],
      ifelse(
        .randomNumber < (percentUpper+percentLower+percentNumber),
        as.integer(rand_num(n=100)+1),
        ifelse(
          .randomNumber < (percentUpper+percentLower+percentNumber+percentSymbol),
          SYMBOLS[as.integer(rand_num(n=1)*length(SYMBOLS)+1)],
          "_"
        )
      )  
    )
  )
}

.generate.random.string <- function(n=1,UsePseudoRandomSeed=TRUE){
  # this should set a new seed every time it's called based on time script run and sessions PID
  if(UsePseudoRandomSeed){
    set.seed(as.numeric(Sys.time(), digits=17)*10^8-(as.integer(as.numeric(Sys.time(), digits=17))*10^8)+Sys.getpid())
  }
  paste(
    unlist(lapply(X=1:n,FUN=.get.random.character)),
    collapse=""
  )
}

### Initalization functions
.normalize.path <- function(
  somePath,
  winslash=if (.Platform$OS.type == "windows") {
    fileSeperator <- "\\"
  } else {
    fileSeperator <- .Platform$file.sep
  },
  mustWork=FALSE
){
  # was having many issues using winslash=.Platform$file.sep, switching to winslash="\\\\" works
  normalizePath(somePath, winslash=winslash, mustWork=mustWork)
}
.file.path<-function(
  ...,
  fsep=if (.Platform$OS.type == "windows") {
    fileSeperator <- "\\"
  } else {
    fileSeperator <- .Platform$file.sep
  }
){
  # was having many issues using fsep=.Platform$file.sep, switching to fsep="\\\\" works
  .normalize.path(.Internal(file.path(list(...), fsep)))
}

## Script unique functions ###

# only looking at columns that contain a string longer than 7 characters
# the average character of surname or firstname is roughly 6 and one standard deviation is roughly 1.5
# so if a column only had 7 first or last names in it, it's highly likely that at least one would be longer than 8 charaters
.validate.is.short.name.string <- function(x){
   maxChars <- max(nchar(x))
   return(is.character(x) && maxChars > 7)
}

.choose_directory <- function(caption= 'Select data directory') {
# .choose_directory() is modified from https://stackoverflow.com/a/48243694
  if (exists('utils::choose.dir')) {
    utils::choose.dir(caption = caption)
  } else {
    tcltk::tk_choose.dir(caption = caption)
  }
}

.sanitizeFiles <- function(sourceFilePathToSanitize){
  fCancel <- FALSE
  # Get data set
  attempt <- tryCatch({
    dataSet <- fread(sourceFilePathToSanitize)
    if(nrow(dataSet)<=1){
      message("no data found::skipped file transfer for:",basename(sourceFilePathToSanitize))
      fCancel <- TRUE
    }
  }, error = function(e) {
    message(
      "Warning :: resuming - next file ->  failed to read file: ",
      basename(sourceFilePathToSanitize),
      "" # "\n Here's the original error message:\n", e
    )  
    fCancel <- TRUE
  })
  if(!fCancel){
    attempt <- tryCatch({
      fwrite(
        x=data.table(apply(X=dataSet, MARGIN=2, FUN=str_length)),
        file=.file.path(destinationPath,"mask-str_length",basename(sourceFilePathToSanitize))
      )
    }, error = function(e) {
      message(
        "Warning :: resuming - even though ->  failed to write length mask file: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )  
    })
    attempt <- tryCatch({
      # a little prep :: trim everything that's a string
      dataSet.chars <- dataSet %>% Filter(f=is.character)
      if(length(names(dataSet.chars ))>0){
        dataSet.chars <- lapply(X=dataSet.chars, FUN=str_trim)
        dataSet <- dplyr::bind_cols(
          select(dataSet,-data.frame(names(dataSet.chars))[[1]]),
          dataSet.chars
        )
      }
    }, error = function(e) {
      message(
        "Warning :: resuming - even though -> failed to apply trim on character columns: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )
    })
    attempt <- tryCatch({
      ### The memo field hatchet - broadly hash all possible memo field by hashing all long text columns,  see requirements documentation to enhance this
      #1- Prep/find: Hash columns of text over 22 characters in length for text fields that could contain memo fields
      dataSetFiltered <- dataSet %>%
        Filter(f=function(x) is.character(x) && max(nchar(x)) > 22)
      #3- Clean: Replace a zero length strings with NA (So we don't hash them)
      dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
      #4- Process: Hash everything found
      dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
      #5- Replace: updated column(s)
      columnNamesFiltered <- data.frame(names(dataSetFiltered))
      dataSetResults <- dplyr::bind_cols(select(dataSet,-(columnNamesFiltered[[1]])), dataSetFiltered)
      # Clean as we go...    
      suppressWarnings(rm("dataSetFiltered"))
      #suppressWarnings(gc(verbose=TRUE,reset=TRUE))
    }, error = function(e) {
      message(
        "Warning :: resuming - even though -> failure durring find/processsing of long text columns: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )  
    })  
    attempt <- tryCatch({
      ### SSN/EmployeeId/PhoneNumbers
      #1- Prep: removed all these characters from our search set for SSN, EmployeeId, PhoneNumbers searches: '-','(',')','\','/'
      dataSetSearchChar <- dataSet %>%
        Filter(f=is.character) %>%
        select(-(columnNamesFiltered[[1]]))

      dataSetSearchChar <- mutate_all(dataSetSearchChar, list(~str_remove_all(string=.,"[\\/: \\-\\(\\),\\.\\/\\\\]")))
      #2- Clean replace a zero lenth strings with NA (So we don't search or hash them)
      dataSetSearchChar <- mutate_all(dataSetSearchChar, list(~na_if(.,"")))
      # set NA mask for counting value is present
      dataSetSearchFilteredCheckIsNotNa <-
   mutate_all(dataSetSearchChar , list(~suppressWarnings(!is.na(.))))
      # set string character length mask
      # remove all NAs for further processing/searching
      dataSetSearchChar[is.na(dataSetSearchChar)] <- ""
      #3- Find likely/possible columns
      dataSetCharacterLengthMask <- mutate_all(dataSetSearchChar, list(~str_length(.)))
      #colSums(dataSetCharacterLengthMask)
      #NOT USED: colMeans(dataSetCharacterLengthMask ) # Don't use colMeans here as we would be inlcuding all NAs/empties, so do the math
      dataSetMeanStringLength <- colSums(dataSetCharacterLengthMask)/colSums(dataSetSearchFilteredCheckIsNotNa)
      dataSetSearchCharCheckForNumeric <-
        mutate_all(dataSetSearchChar, list(~str_sub(string=.,start=1,end=4)))
      dataSetSearchCharCheckForNumeric <-
        mutate_all(dataSetSearchCharCheckForNumeric , list(~suppressWarnings(!is.na(as.numeric(.)))))
      #Find columns with a percent
      percentValuesNumeric <-
        (
          colSums(dataSetSearchCharCheckForNumeric)/colSums(dataSetSearchFilteredCheckIsNotNa)
        )
      # filter to mostly numerics for the SSN/EmployeeId/PhoneNumber search
      # only look at mostly numeric values after removing dash,hyphen ect. of the correct length...
      dataSetCharWithNumericColumns <- names(percentValuesNumeric[percentValuesNumeric > .6])
      dataSetMeanStringLength.ssn <- dataSetMeanStringLength[dataSetMeanStringLength < 9.1]
      dataSetMeanStringLength.ssn <- dataSetMeanStringLength.ssn[dataSetMeanStringLength.ssn > 8.9 ]
      dataSetPotentialSsnColumns <- names(dataSetMeanStringLength.ssn)
 
      # APPLY filter for mostly numeric on search /found columns...
      dataSetPotentialSsnColumns  <- intersect(dataSetPotentialSsnColumns ,dataSetCharWithNumericColumns)
      # [TODO] before assuming a column is a SSN should check that no values (no more that a couple of fake user accounts)
      # that are 9 characters long contain leading, trailing or mid 0's,
      dataSetPotentialPhoneEmployeeIdColumns <-  
        dataSetMeanStringLength  %>%
        Filter(f=function(x) x < 10.0001 && x > 9.91) %>%
        data.frame() %>%
        row.names()
      dataSetPotentialPhoneEmployeeIdColumns  <- intersect(dataSetPotentialPhoneEmployeeIdColumns ,dataSetCharWithNumericColumns)

      # check the rest of the columns for potential SSN/EmployeeId columns
      dataSetSearchNotChar <- dataSet %>%
        select(-(columnNamesFiltered[[1]])) %>%
        select(-(data.frame(names(dataSetSearchChar))[[1]]))

      dataSetSearchMaskIsNotZero <- mutate_all(dataSetSearchNotChar, list(~!(.) == 0 ))
      .is.ssn.EmployeeId.Range <- function(x){
        # SSNs are 9 digits long  | {OR} employee IDs that are 10 digits long and begin with 1
        (x > (10^8-1) & x < (10^9-1)) | (x > (10^9 * 1 - 1) & x < (10^9 * 2 - 1))
      }
      dataSetSearchMaskIsSsnEmployeeIdRange <- mutate_all(dataSetSearchNotChar, list(~(.is.ssn.EmployeeId.Range(x=.))))
      # Percent of column that contains ssn or EmployeeId range values (excluding 0 and na values)
      dataSetSearchPercentIsSsnEmployeeIdRange <- colSums(dataSetSearchMaskIsSsnEmployeeIdRange) /  colSums(dataSetSearchMaskIsNotZero)
      # Add newly found columns
      dataSetPotentialPhoneEmployeeIdColumns <- dataSetPotentialPhoneEmployeeIdColumns %>%  union(
        dataSetSearchPercentIsSsnEmployeeIdRange %>%
        Filter(f=function(x) x <= 1 && x >.9 ) %>%
        names()
      )
      # Not using {Mean search = Sum of columns /  Count of non-zero values} percent of those that fall in range is better...
      # NOT RUN: dataSetMeanSearchNotChar <- colSums(dataSetSearchNotChar) / colSums(dataSetSearchMaskIsNotZero)

    if(length(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))>0){
        dataSetFiltered <- dataSetResults %>% select(data.frame(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))[[1]])
        dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
        #4- Process: Hash everything found
        dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
        #5- Replace: updated column(s)
        columnNamesFiltered.ssn <- data.frame(names(dataSetFiltered))
        dataSetResults <- dplyr::bind_cols(select(dataSetResults,-(columnNamesFiltered.ssn[[1]])), dataSetFiltered)
        columnNamesFiltered <- suppressWarnings(bind_rows(columnNamesFiltered, columnNamesFiltered.ssn))
        # Clean as we go...    
        suppressWarnings(rm("dataSetFiltered"))
        suppressWarnings(gc(verbose=TRUE,reset=TRUE))
      }
    }, error = function(e) {
      message(
        "Warning :: resuming - even though -> failure durring SSN/EmployeeId/PhoneNumbers processing: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )  
    })    
    attempt <- tryCatch({
      ### NAMES, first or last beginnig matches
      #1/2- Prep/Clean: Apply column filter 1
      dataSetSearchChar <-  dataSetSearchChar %>% select(-data.frame(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))[[1]])
      dataSetCharWithTextColumns <- names(percentValuesNumeric[percentValuesNumeric<=.5])
      #3- Find: looking for names columns, so charaters, not begining with numbers, with character lengths with mean between 4 and 7, or > 10 (first, middle and last... could be more)
      dataSetPotentialNameColumns <- names(dataSetMeanStringLength  %>% Filter(f=function(x) x < 7 && x > 4 || x > 10 ))
      dataSetPotentialNameColumns <-
        intersect(
          dataSetPotentialNameColumns,
          intersect(
            names(dataSetSearchChar),
            dataSetCharWithTextColumns
          )
        )
      dataSetSearchChar <- dataSetSearchChar %>% select(data.frame(dataSetPotentialNameColumns)[[1]])
      dataSetSearchForName <- mutate_all(dataSetSearchChar, list(~str_to_upper(str_sub(string=.,start=1,end=5))))
      dataSetSearchForNameFound <-
        mutate_all(dataSetSearchForName , list(~(.) %in% lstName.short))
      dataSetFilteredIsNotNaSums <- dataSetSearchFilteredCheckIsNotNa %>% select(data.frame(names(dataSetSearchForName))[[1]]) %>% colSums()
      dataSetFiltered.name.percent <- colSums(dataSetSearchForNameFound)/dataSetFilteredIsNotNaSums
      #Filter on >15% of present values start 5 chars with a match to a name
      dataSetFiltered.name.percent <- dataSetFiltered.name.percent[dataSetFiltered.name.percent>.15]
      dataSetFiltered <- dataSetResults %>% select(names(dataSetFiltered.name.percent))
      #3- Clean: Replace a zero lenth strings with NA (So we don't hash them)
      dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
      #4- Process: Hash everything found
      dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
      #5- Replace: updated column(s)'
      # Add these to existing list
      columnNamesFilteredName <- data.frame(names(dataSetFiltered))
      dataSetResults <- dplyr::bind_cols(select(dataSetResults,-(columnNamesFilteredName[[1]])), dataSetFiltered)
      # Keeping track of all columns updated  
      columnNamesFiltered <- suppressWarnings(bind_rows(columnNamesFiltered,columnNamesFilteredName))
    }, error = function(e) {
      message(
        "Warning :: resuming - even though -> failure durring find Person Names processing: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )  
    })  
    attempt <- tryCatch({
      ### Wrap up and Write out our results
      destinationFileName <- .file.path(destinationPath, basename(sourceFilePathToSanitize))
      appemptDeleteFilePriorToWriting <- suppressWarnings(file.remove(destinationFileName ))
      destinationWasAvailable <- ! file.exists(destinationFileName) && dir.exists(dirname(destinationFileName))
      fwrite(x=dataSetResults,file=destinationFileName )
      message(
        if(destinationWasAvailable && file.exists(destinationFileName)){
          paste0(
            "Replaced values in file <",basename(sourceFilePathToSanitize),
            "> with hash of values from fields: ",paste(unique(columnNamesFiltered[[1]]),collapse=" | ")
          )  
        }else{
          paste0("ERROR: Check Destination: Failed to write file:",sourceFilePathToSanitize)
        }
      )
    }, error = function(e) {
      message(
        "Warning :: resuming - even though -> failure durring write results: ",
        basename(sourceFilePathToSanitize),
        "" # "\n Here's the original error message:\n", e
      )  
    })    
  }
  # Cleanup to free up RAM
  suppressWarnings(rm(list=ls(pattern="^data\\.set")))
  suppressWarnings(gc(verbose=TRUE,reset=TRUE))
}

### Execute script ###
### Set Configuration variables ###
suppressWarnings(rm(list=c("sourcePath","destinationPath","my.salt")))
configPath <- .file.path(Sys.getenv("R_CUSTOM_INITILIZATION_PATH"),"config","sanitization.R")
fSourceExists <- FALSE
fCancel <- FALSE
if(file.exists(configPath)){
  source(configPath)
  if(exists("sourcePath")){
    if(dir.exists(sourcePath)){
      fSourceExists <- TRUE
    }
  }  
}
if(!fSourceExists){
  sourcePath <- .choose_directory(caption="Select the Source Directory")
  if(is.na(sourcePath)){
    fCancelMessage("User canceled selecting 'Source Path': Exiting script")
    fCancel <- TRUE
  }
}
if(!fCancel){
  fDestinationExists <- FALSE
  if (exists("destinationPath")){
    suppressWarnings(dir.create(destinationPath))
    # [TODO] add interactive selection method for linux systems
    if(dir.exists(sourcePath)){
      fDestinationExists <- TRUE
    }
  }
  if(!fDestinationExists){
    destinationPath <- .choose_directory(caption="Select the Destination Directory")
    if(is.na(sourcePath)){
      message("User canceled selecting 'Destination Path': Exiting script")
      fCancel <- TRUE
    }
  }  
}
if(!fCancel){
# Logging! Every message, warning, and stop will now also write to this path ...
loggit::set_logfile(.file.path(destinationPath,"cleanupData-Log.ndjson"))
  # a file that contains only a secret passphrase
  if(file.exists(configPath)){
    if(file.exists(.file.path(dirname(configPath),"my_secret_salt"))){
      my.salt <- fread(.file.path(dirname(configPath),"my_secret_salt"),header=FALSE)[[1]]
    }
  }
  if(!exists("my.salt")){
    #[TODO] use askpass, instead of realine for a maked passphrase if it is available
    my.salt <-  readline(prompt=
      paste0(
        "Type a passphrase to salt your hashes with and hit {Enter}, \n  ",
        "if left blank a random 32 character string will be used:\n"
      )
    )
    if(!exists("my.salt")){
      my.salt <- .generate.random.string(32)
    }
    if(is.na(my.salt)){
      my.salt <- .generate.random.string(32)
    }
    if(str_length(my.salt) == 0) {
      my.salt <- .generate.random.string(32)
    }
  }
  if(!exists("my.salt")){
    fCancel <- TRUE
  }
  if(fCancel || str_length(my.salt)==0){
    fCancel<-TRUE
  }
}
.hash <- function(x){
  # using sha256 with cusom salt for strength, and then truncate to 32 characters 128bits to reduce size (stronger than md5, same storage)...
  attempt <- try(str_sub(sha256(as.character(x),my.salt),end=32))
  # NULL and NA, or non-characters return NULL
  if(class(attempt)=="try-error"){return(NULL)}else{return(attempt)}
}
if(!fCancel){
  ### Begin Processing ###
  gc(verbose=TRUE,reset=TRUE)
  # log start and machine/config info
  message(paste0(
    "Begin PII/Memo sanitization to ",Sys.getenv("NUMBER_OF_PROCESSORS"), " processors and ",
    round( memory.limit()/2^10,2) ," GB RAM and OpenSSL with:",  
    paste(paste0(names(openssl_config())),paste0(openssl_config()),sep="=",collapse="; ")))
 
  ####
  # Load our names lists, these tables where generated from a data source on the US Census website.
  reference.path <- .file.path(dirname(destinationPath),"referenceDataForAnonymization")
  # The list of surnames is very long, so the original file has been reduced to only the 20000 most common surnames in the US
  tblSurnames <- fread(.file.path(reference.path,"tblRandomNamesSurname1990.txt"))
  tblFemaleFirstNames <- fread(.file.path(reference.path,"tblRandomNamesFirstFemale1990.txt"))
  tblMaleFirstNames <- fread(.file.path(reference.path,"tblRandomNamesFirstMale1990.txt"))
  tblFirstNames <- rbind(tblFemaleFirstNames,tblMaleFirstNames)
  lstSurname <- tblSurnames$Surname
  lstFirstName <- c(tblFemaleFirstNames$FirstName,tblMaleFirstNames$FirstName)
  lstSurname.short <- unique(str_sub(lstSurname,1,5))
  lstFirstName.short <- unique(str_sub(lstFirstName,1,5))
  lstName.short <- unique(c(lstSurname.short,lstFirstName.short))

  suppressWarnings(dir.create(.file.path(destinationPath,"mask-str_length")))

  # get lists of files to process.
  # per current naming convention universes are prefixed infront of the .csv files...
 
  # RUN on ALL .CSV files in the sourcePath
  lapply(.normalize.path(list.files(sourcePath,full.names=TRUE,pattern="*.csv")),FUN=.sanitizeFiles)
  suppressWarnings(dir.create(destination.ResultsData_First2kLines))
  source(.file.path(dirname(destinationPath),"R Scripts","21_exploreSanatizedData-GenerateTruncateDataSets.R"))
  # Example For testing a single file
  # sourceFilePathToSanitize <- .file.path(sourcePath,"payroll.csv")
  #.sanitizeFiles(sourceFilePathToSanitize)
  }
}

TO DO: Unit tests!

Utility functions

Primary functions

Option to use numbers instead of hashes

Add option to use ordered integers for new ids rather than hashes

Update deidentify.R to read in / use existing crosswalk

Need

There needs to be a way to use an existing crosswalk so that unique IDs can be linked across discrete files. Right now, a crosswalk per file is created, which means that observations cannot be linked across files without combining multiple crosswalks together.

Example case

The example case is in panel data, where each file represents a unique time period, but individuals will be observed in more than one period.

TO DO: Vignette / GitHub page

Write vignette / GitHub page showing work flow.

Error in .hasNoSharing() : could not find function ".hasNoSharing"

Hi btskinner,

I am running into an error with deidenitfying the data. Any help would be much appreciated. Here is the code I have been using:

library(duawranglr)

## get crosswalk and admin data files
dua_cw_file <- system.file('extdata', 'dua_cw.csv', package = 'duawranglr')
admin_file <- system.file('extdata', 'admin_data.csv', package = 'duawranglr')

## set the DUA crosswalk
set_dua_cw(dua_cw_file)

set_dua_level('level_ii', deidentify_required = TRUE, id_column = 'CCCCO_Assigned')

## compare level II and III restrictions
see_dua_options(level = c('level_ii', 'level_iii'))

## deidentify data
Award_DB_14_20_tableau <- deid_dua(Award_DB_14_20_work, id_col = 'CCCCO_Assigned', new_id_name = 'id', write_crosswalk = TRUE, id_length = 20)

The last ## step is where i'm running into the error, do you have any suggestions?

best,
mark

btskinner / duawranglr Goto Github PK

duawranglr's Introduction

Packages

Course/lessons focused on coding/quantitative research

Replication files

General

duawranglr's People

Contributors

Stargazers

Watchers

Forkers

duawranglr's Issues

Feature Request

Utility functions

Primary functions

Need

Example case

Recommend Projects

Recommend Topics

Recommend Org