When I need to review large data sets with many tables and columns it would be helpful if there was a function that would suggest the columns and tables that may contain likely PII content. Several methods of doing this are in the following script (this is a get'r done script, and would need to be re-built for this purpose, to suggest, instead of sanitize everything)
# License for this script: [CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/)
library(tidyverse) #data wrangling
library(data.table) #data wrangling
library(dplyr) #data wrangling
library(lubridate) #handle dates
library(stringr)# https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_strings.pdf
library(loggit) # for simple logging
library(openssl) # # Used to generate hash values
### Generic functions !!! ###
.get.random.character <- function(...,percentUpper=.4,percentLower=.4,percentNumber=.1, percentSymbol=.1){
# Should never return "_"
percentUpper <- percentUpper /(percentUpper + percentLower + percentNumber + percentSymbol)
percentLower <- percentLower /(percentUpper + percentLower + percentNumber + percentSymbol)
percentNumber <- percentNumber /(percentUpper + percentLower + percentNumber + percentSymbol)
percentSymbol <- percentSymbol /(percentUpper + percentLower + percentNumber + percentSymbol)
# Using the restricted symbols set used by the LASTPASS random password generator
SYMBOLS <- c("!","&","%","#","$","@","*","^")
.randomNumber <- rand_num(n=1)
ifelse(
.randomNumber < percentUpper,
LETTERS[as.integer(rand_num(n=1)*26+1)],
ifelse(
.randomNumber < (percentUpper+percentLower),
letters[as.integer(rand_num(n=1)*26+1)],
ifelse(
.randomNumber < (percentUpper+percentLower+percentNumber),
as.integer(rand_num(n=100)+1),
ifelse(
.randomNumber < (percentUpper+percentLower+percentNumber+percentSymbol),
SYMBOLS[as.integer(rand_num(n=1)*length(SYMBOLS)+1)],
"_"
)
)
)
)
}
.generate.random.string <- function(n=1,UsePseudoRandomSeed=TRUE){
# this should set a new seed every time it's called based on time script run and sessions PID
if(UsePseudoRandomSeed){
set.seed(as.numeric(Sys.time(), digits=17)*10^8-(as.integer(as.numeric(Sys.time(), digits=17))*10^8)+Sys.getpid())
}
paste(
unlist(lapply(X=1:n,FUN=.get.random.character)),
collapse=""
)
}
### Initalization functions
.normalize.path <- function(
somePath,
winslash=if (.Platform$OS.type == "windows") {
fileSeperator <- "\\"
} else {
fileSeperator <- .Platform$file.sep
},
mustWork=FALSE
){
# was having many issues using winslash=.Platform$file.sep, switching to winslash="\\\\" works
normalizePath(somePath, winslash=winslash, mustWork=mustWork)
}
.file.path<-function(
...,
fsep=if (.Platform$OS.type == "windows") {
fileSeperator <- "\\"
} else {
fileSeperator <- .Platform$file.sep
}
){
# was having many issues using fsep=.Platform$file.sep, switching to fsep="\\\\" works
.normalize.path(.Internal(file.path(list(...), fsep)))
}
## Script unique functions ###
# only looking at columns that contain a string longer than 7 characters
# the average character of surname or firstname is roughly 6 and one standard deviation is roughly 1.5
# so if a column only had 7 first or last names in it, it's highly likely that at least one would be longer than 8 charaters
.validate.is.short.name.string <- function(x){
maxChars <- max(nchar(x))
return(is.character(x) && maxChars > 7)
}
.choose_directory <- function(caption= 'Select data directory') {
# .choose_directory() is modified from https://stackoverflow.com/a/48243694
if (exists('utils::choose.dir')) {
utils::choose.dir(caption = caption)
} else {
tcltk::tk_choose.dir(caption = caption)
}
}
.sanitizeFiles <- function(sourceFilePathToSanitize){
fCancel <- FALSE
# Get data set
attempt <- tryCatch({
dataSet <- fread(sourceFilePathToSanitize)
if(nrow(dataSet)<=1){
message("no data found::skipped file transfer for:",basename(sourceFilePathToSanitize))
fCancel <- TRUE
}
}, error = function(e) {
message(
"Warning :: resuming - next file -> failed to read file: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
fCancel <- TRUE
})
if(!fCancel){
attempt <- tryCatch({
fwrite(
x=data.table(apply(X=dataSet, MARGIN=2, FUN=str_length)),
file=.file.path(destinationPath,"mask-str_length",basename(sourceFilePathToSanitize))
)
}, error = function(e) {
message(
"Warning :: resuming - even though -> failed to write length mask file: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
attempt <- tryCatch({
# a little prep :: trim everything that's a string
dataSet.chars <- dataSet %>% Filter(f=is.character)
if(length(names(dataSet.chars ))>0){
dataSet.chars <- lapply(X=dataSet.chars, FUN=str_trim)
dataSet <- dplyr::bind_cols(
select(dataSet,-data.frame(names(dataSet.chars))[[1]]),
dataSet.chars
)
}
}, error = function(e) {
message(
"Warning :: resuming - even though -> failed to apply trim on character columns: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
attempt <- tryCatch({
### The memo field hatchet - broadly hash all possible memo field by hashing all long text columns, see requirements documentation to enhance this
#1- Prep/find: Hash columns of text over 22 characters in length for text fields that could contain memo fields
dataSetFiltered <- dataSet %>%
Filter(f=function(x) is.character(x) && max(nchar(x)) > 22)
#3- Clean: Replace a zero length strings with NA (So we don't hash them)
dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
#4- Process: Hash everything found
dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
#5- Replace: updated column(s)
columnNamesFiltered <- data.frame(names(dataSetFiltered))
dataSetResults <- dplyr::bind_cols(select(dataSet,-(columnNamesFiltered[[1]])), dataSetFiltered)
# Clean as we go...
suppressWarnings(rm("dataSetFiltered"))
#suppressWarnings(gc(verbose=TRUE,reset=TRUE))
}, error = function(e) {
message(
"Warning :: resuming - even though -> failure durring find/processsing of long text columns: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
attempt <- tryCatch({
### SSN/EmployeeId/PhoneNumbers
#1- Prep: removed all these characters from our search set for SSN, EmployeeId, PhoneNumbers searches: '-','(',')','\','/'
dataSetSearchChar <- dataSet %>%
Filter(f=is.character) %>%
select(-(columnNamesFiltered[[1]]))
dataSetSearchChar <- mutate_all(dataSetSearchChar, list(~str_remove_all(string=.,"[\\/: \\-\\(\\),\\.\\/\\\\]")))
#2- Clean replace a zero lenth strings with NA (So we don't search or hash them)
dataSetSearchChar <- mutate_all(dataSetSearchChar, list(~na_if(.,"")))
# set NA mask for counting value is present
dataSetSearchFilteredCheckIsNotNa <-
mutate_all(dataSetSearchChar , list(~suppressWarnings(!is.na(.))))
# set string character length mask
# remove all NAs for further processing/searching
dataSetSearchChar[is.na(dataSetSearchChar)] <- ""
#3- Find likely/possible columns
dataSetCharacterLengthMask <- mutate_all(dataSetSearchChar, list(~str_length(.)))
#colSums(dataSetCharacterLengthMask)
#NOT USED: colMeans(dataSetCharacterLengthMask ) # Don't use colMeans here as we would be inlcuding all NAs/empties, so do the math
dataSetMeanStringLength <- colSums(dataSetCharacterLengthMask)/colSums(dataSetSearchFilteredCheckIsNotNa)
dataSetSearchCharCheckForNumeric <-
mutate_all(dataSetSearchChar, list(~str_sub(string=.,start=1,end=4)))
dataSetSearchCharCheckForNumeric <-
mutate_all(dataSetSearchCharCheckForNumeric , list(~suppressWarnings(!is.na(as.numeric(.)))))
#Find columns with a percent
percentValuesNumeric <-
(
colSums(dataSetSearchCharCheckForNumeric)/colSums(dataSetSearchFilteredCheckIsNotNa)
)
# filter to mostly numerics for the SSN/EmployeeId/PhoneNumber search
# only look at mostly numeric values after removing dash,hyphen ect. of the correct length...
dataSetCharWithNumericColumns <- names(percentValuesNumeric[percentValuesNumeric > .6])
dataSetMeanStringLength.ssn <- dataSetMeanStringLength[dataSetMeanStringLength < 9.1]
dataSetMeanStringLength.ssn <- dataSetMeanStringLength.ssn[dataSetMeanStringLength.ssn > 8.9 ]
dataSetPotentialSsnColumns <- names(dataSetMeanStringLength.ssn)
# APPLY filter for mostly numeric on search /found columns...
dataSetPotentialSsnColumns <- intersect(dataSetPotentialSsnColumns ,dataSetCharWithNumericColumns)
# [TODO] before assuming a column is a SSN should check that no values (no more that a couple of fake user accounts)
# that are 9 characters long contain leading, trailing or mid 0's,
dataSetPotentialPhoneEmployeeIdColumns <-
dataSetMeanStringLength %>%
Filter(f=function(x) x < 10.0001 && x > 9.91) %>%
data.frame() %>%
row.names()
dataSetPotentialPhoneEmployeeIdColumns <- intersect(dataSetPotentialPhoneEmployeeIdColumns ,dataSetCharWithNumericColumns)
# check the rest of the columns for potential SSN/EmployeeId columns
dataSetSearchNotChar <- dataSet %>%
select(-(columnNamesFiltered[[1]])) %>%
select(-(data.frame(names(dataSetSearchChar))[[1]]))
dataSetSearchMaskIsNotZero <- mutate_all(dataSetSearchNotChar, list(~!(.) == 0 ))
.is.ssn.EmployeeId.Range <- function(x){
# SSNs are 9 digits long | {OR} employee IDs that are 10 digits long and begin with 1
(x > (10^8-1) & x < (10^9-1)) | (x > (10^9 * 1 - 1) & x < (10^9 * 2 - 1))
}
dataSetSearchMaskIsSsnEmployeeIdRange <- mutate_all(dataSetSearchNotChar, list(~(.is.ssn.EmployeeId.Range(x=.))))
# Percent of column that contains ssn or EmployeeId range values (excluding 0 and na values)
dataSetSearchPercentIsSsnEmployeeIdRange <- colSums(dataSetSearchMaskIsSsnEmployeeIdRange) / colSums(dataSetSearchMaskIsNotZero)
# Add newly found columns
dataSetPotentialPhoneEmployeeIdColumns <- dataSetPotentialPhoneEmployeeIdColumns %>% union(
dataSetSearchPercentIsSsnEmployeeIdRange %>%
Filter(f=function(x) x <= 1 && x >.9 ) %>%
names()
)
# Not using {Mean search = Sum of columns / Count of non-zero values} percent of those that fall in range is better...
# NOT RUN: dataSetMeanSearchNotChar <- colSums(dataSetSearchNotChar) / colSums(dataSetSearchMaskIsNotZero)
if(length(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))>0){
dataSetFiltered <- dataSetResults %>% select(data.frame(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))[[1]])
dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
#4- Process: Hash everything found
dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
#5- Replace: updated column(s)
columnNamesFiltered.ssn <- data.frame(names(dataSetFiltered))
dataSetResults <- dplyr::bind_cols(select(dataSetResults,-(columnNamesFiltered.ssn[[1]])), dataSetFiltered)
columnNamesFiltered <- suppressWarnings(bind_rows(columnNamesFiltered, columnNamesFiltered.ssn))
# Clean as we go...
suppressWarnings(rm("dataSetFiltered"))
suppressWarnings(gc(verbose=TRUE,reset=TRUE))
}
}, error = function(e) {
message(
"Warning :: resuming - even though -> failure durring SSN/EmployeeId/PhoneNumbers processing: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
attempt <- tryCatch({
### NAMES, first or last beginnig matches
#1/2- Prep/Clean: Apply column filter 1
dataSetSearchChar <- dataSetSearchChar %>% select(-data.frame(c(dataSetPotentialSsnColumns,dataSetPotentialPhoneEmployeeIdColumns))[[1]])
dataSetCharWithTextColumns <- names(percentValuesNumeric[percentValuesNumeric<=.5])
#3- Find: looking for names columns, so charaters, not begining with numbers, with character lengths with mean between 4 and 7, or > 10 (first, middle and last... could be more)
dataSetPotentialNameColumns <- names(dataSetMeanStringLength %>% Filter(f=function(x) x < 7 && x > 4 || x > 10 ))
dataSetPotentialNameColumns <-
intersect(
dataSetPotentialNameColumns,
intersect(
names(dataSetSearchChar),
dataSetCharWithTextColumns
)
)
dataSetSearchChar <- dataSetSearchChar %>% select(data.frame(dataSetPotentialNameColumns)[[1]])
dataSetSearchForName <- mutate_all(dataSetSearchChar, list(~str_to_upper(str_sub(string=.,start=1,end=5))))
dataSetSearchForNameFound <-
mutate_all(dataSetSearchForName , list(~(.) %in% lstName.short))
dataSetFilteredIsNotNaSums <- dataSetSearchFilteredCheckIsNotNa %>% select(data.frame(names(dataSetSearchForName))[[1]]) %>% colSums()
dataSetFiltered.name.percent <- colSums(dataSetSearchForNameFound)/dataSetFilteredIsNotNaSums
#Filter on >15% of present values start 5 chars with a match to a name
dataSetFiltered.name.percent <- dataSetFiltered.name.percent[dataSetFiltered.name.percent>.15]
dataSetFiltered <- dataSetResults %>% select(names(dataSetFiltered.name.percent))
#3- Clean: Replace a zero lenth strings with NA (So we don't hash them)
dataSetFiltered <- mutate_all(dataSetFiltered, list(~na_if(.,"")))
#4- Process: Hash everything found
dataSetFiltered <-mutate_all(dataSetFiltered, list(~.hash(.)))
#5- Replace: updated column(s)'
# Add these to existing list
columnNamesFilteredName <- data.frame(names(dataSetFiltered))
dataSetResults <- dplyr::bind_cols(select(dataSetResults,-(columnNamesFilteredName[[1]])), dataSetFiltered)
# Keeping track of all columns updated
columnNamesFiltered <- suppressWarnings(bind_rows(columnNamesFiltered,columnNamesFilteredName))
}, error = function(e) {
message(
"Warning :: resuming - even though -> failure durring find Person Names processing: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
attempt <- tryCatch({
### Wrap up and Write out our results
destinationFileName <- .file.path(destinationPath, basename(sourceFilePathToSanitize))
appemptDeleteFilePriorToWriting <- suppressWarnings(file.remove(destinationFileName ))
destinationWasAvailable <- ! file.exists(destinationFileName) && dir.exists(dirname(destinationFileName))
fwrite(x=dataSetResults,file=destinationFileName )
message(
if(destinationWasAvailable && file.exists(destinationFileName)){
paste0(
"Replaced values in file <",basename(sourceFilePathToSanitize),
"> with hash of values from fields: ",paste(unique(columnNamesFiltered[[1]]),collapse=" | ")
)
}else{
paste0("ERROR: Check Destination: Failed to write file:",sourceFilePathToSanitize)
}
)
}, error = function(e) {
message(
"Warning :: resuming - even though -> failure durring write results: ",
basename(sourceFilePathToSanitize),
"" # "\n Here's the original error message:\n", e
)
})
}
# Cleanup to free up RAM
suppressWarnings(rm(list=ls(pattern="^data\\.set")))
suppressWarnings(gc(verbose=TRUE,reset=TRUE))
}
### Execute script ###
### Set Configuration variables ###
suppressWarnings(rm(list=c("sourcePath","destinationPath","my.salt")))
configPath <- .file.path(Sys.getenv("R_CUSTOM_INITILIZATION_PATH"),"config","sanitization.R")
fSourceExists <- FALSE
fCancel <- FALSE
if(file.exists(configPath)){
source(configPath)
if(exists("sourcePath")){
if(dir.exists(sourcePath)){
fSourceExists <- TRUE
}
}
}
if(!fSourceExists){
sourcePath <- .choose_directory(caption="Select the Source Directory")
if(is.na(sourcePath)){
fCancelMessage("User canceled selecting 'Source Path': Exiting script")
fCancel <- TRUE
}
}
if(!fCancel){
fDestinationExists <- FALSE
if (exists("destinationPath")){
suppressWarnings(dir.create(destinationPath))
# [TODO] add interactive selection method for linux systems
if(dir.exists(sourcePath)){
fDestinationExists <- TRUE
}
}
if(!fDestinationExists){
destinationPath <- .choose_directory(caption="Select the Destination Directory")
if(is.na(sourcePath)){
message("User canceled selecting 'Destination Path': Exiting script")
fCancel <- TRUE
}
}
}
if(!fCancel){
# Logging! Every message, warning, and stop will now also write to this path ...
loggit::set_logfile(.file.path(destinationPath,"cleanupData-Log.ndjson"))
# a file that contains only a secret passphrase
if(file.exists(configPath)){
if(file.exists(.file.path(dirname(configPath),"my_secret_salt"))){
my.salt <- fread(.file.path(dirname(configPath),"my_secret_salt"),header=FALSE)[[1]]
}
}
if(!exists("my.salt")){
#[TODO] use askpass, instead of realine for a maked passphrase if it is available
my.salt <- readline(prompt=
paste0(
"Type a passphrase to salt your hashes with and hit {Enter}, \n ",
"if left blank a random 32 character string will be used:\n"
)
)
if(!exists("my.salt")){
my.salt <- .generate.random.string(32)
}
if(is.na(my.salt)){
my.salt <- .generate.random.string(32)
}
if(str_length(my.salt) == 0) {
my.salt <- .generate.random.string(32)
}
}
if(!exists("my.salt")){
fCancel <- TRUE
}
if(fCancel || str_length(my.salt)==0){
fCancel<-TRUE
}
}
.hash <- function(x){
# using sha256 with cusom salt for strength, and then truncate to 32 characters 128bits to reduce size (stronger than md5, same storage)...
attempt <- try(str_sub(sha256(as.character(x),my.salt),end=32))
# NULL and NA, or non-characters return NULL
if(class(attempt)=="try-error"){return(NULL)}else{return(attempt)}
}
if(!fCancel){
### Begin Processing ###
gc(verbose=TRUE,reset=TRUE)
# log start and machine/config info
message(paste0(
"Begin PII/Memo sanitization to ",Sys.getenv("NUMBER_OF_PROCESSORS"), " processors and ",
round( memory.limit()/2^10,2) ," GB RAM and OpenSSL with:",
paste(paste0(names(openssl_config())),paste0(openssl_config()),sep="=",collapse="; ")))
####
# Load our names lists, these tables where generated from a data source on the US Census website.
reference.path <- .file.path(dirname(destinationPath),"referenceDataForAnonymization")
# The list of surnames is very long, so the original file has been reduced to only the 20000 most common surnames in the US
tblSurnames <- fread(.file.path(reference.path,"tblRandomNamesSurname1990.txt"))
tblFemaleFirstNames <- fread(.file.path(reference.path,"tblRandomNamesFirstFemale1990.txt"))
tblMaleFirstNames <- fread(.file.path(reference.path,"tblRandomNamesFirstMale1990.txt"))
tblFirstNames <- rbind(tblFemaleFirstNames,tblMaleFirstNames)
lstSurname <- tblSurnames$Surname
lstFirstName <- c(tblFemaleFirstNames$FirstName,tblMaleFirstNames$FirstName)
lstSurname.short <- unique(str_sub(lstSurname,1,5))
lstFirstName.short <- unique(str_sub(lstFirstName,1,5))
lstName.short <- unique(c(lstSurname.short,lstFirstName.short))
suppressWarnings(dir.create(.file.path(destinationPath,"mask-str_length")))
# get lists of files to process.
# per current naming convention universes are prefixed infront of the .csv files...
# RUN on ALL .CSV files in the sourcePath
lapply(.normalize.path(list.files(sourcePath,full.names=TRUE,pattern="*.csv")),FUN=.sanitizeFiles)
suppressWarnings(dir.create(destination.ResultsData_First2kLines))
source(.file.path(dirname(destinationPath),"R Scripts","21_exploreSanatizedData-GenerateTruncateDataSets.R"))
# Example For testing a single file
# sourceFilePathToSanitize <- .file.path(sourcePath,"payroll.csv")
#.sanitizeFiles(sourceFilePathToSanitize)
}
}