# Combine county voter counts from various Missouri voter data files
# Franklin Center for Government & Public Integrity
# Earl F Glynn
# efg, 29 July 2012
#
# Input:  every *.csv file in the working directory. Each file is expected to
#         have the columns County, Active and Total, and a file name that
#         starts with the date as yyyy-mm-dd.
# Output: two CSV files (active counts and total counts), one row per county
#         and one column per input file.

################################################################################
### Setup

County.Count <- 114

setwd("F:/Voter-Registration/Missouri/Census-Registration-Comparison/Voter-Files/")

output.active <- "Missouri-County-Voter-Counts-2004-2012-Files-Active.csv"
output.total  <- "Missouri-County-Voter-Counts-2004-2012-Files-Total.csv"

# Standardize county names to the spellings used by the U.S. Census.
# Takes and returns a character vector of county names.
standardize.county <- function(county) {
  county <- gsub("Dekalb|De Kalb", "DeKalb", county)
  county <- gsub("Mcdonald", "McDonald", county)
  county[county == "St. Louis City"] <- "St. Louis city"
  county[county == "St. Louis"] <- "St. Louis County"
  county
}

################################################################################

# Silently remove output files that will be created below
# (avoids problem trying to use out files as input).
# unlink() is used instead of file.remove() because it does not warn when a
# file does not exist (e.g. on the very first run).
unlink(c(output.active, output.total))

# Make list of files to process
files <- list.files(path = ".", pattern = glob2rx("*.csv"))
if (length(files) == 0) {
  stop("No input .csv files found in ", getwd(), call. = FALSE)
}

# One row per county, with totals at bottom
# +3:  KC + St. Louis City + TOTAL line
voters.active <- data.frame(matrix(0, County.Count + 3, length(files)))
voters.total  <- data.frame(matrix(0, County.Count + 3, length(files)))

# Use the first file's (standardized) county names to define the row names.
# The last file is read only to compare names: the biggest difference is
# expected between the first and the last file.
d1 <- read.csv(files[1], as.is = TRUE)
d2 <- read.csv(files[length(files)], as.is = TRUE)

# Standardize to names used by U.S. Census
# Fix d1 county names to conform to the latest
d1$County <- standardize.county(d1$County)

# Diagnostic: show the county names that still differ between the first and
# the last file (visible only where top-level values are auto-printed).
# Use hashing lookup so St. Louis/St. Louis City order does not matter.
mismatch <- which(d1$County != d2$County)
d1$County[mismatch]
d2$County[mismatch]

counties <- d1$County
rownames(voters.active) <- d1$County
rownames(voters.total)  <- d1$County

# Go through all files filling in matrix using
# county name as hash lookup.
for (i in seq_along(files)) {
  d <- read.csv(files[i], as.is = TRUE)

  # Standardize all to U.S. Census spelling
  d$County <- standardize.county(d$County)

  # Grab yyyy-mm-dd from filename and format as
  # Dyyyy.mm.dd to keep R happy
  name <- gsub("-", ".", paste0("D", substr(files[i], 1, 10)), fixed = TRUE)
  colnames(voters.active)[i] <- name
  colnames(voters.total)[i]  <- name

  # Resolve county names to row positions explicitly. Assigning by an unknown
  # row name would silently append a new row to the data frame, and the
  # "TOTAL" relabeling below would then hit the wrong row.
  rows  <- match(d$County, rownames(voters.active))
  known <- !is.na(rows)
  if (any(!known)) {
    warning("Skipping unrecognized county name(s) in ", files[i], ": ",
            paste(unique(d$County[!known]), collapse = ", "),
            call. = FALSE)
  }
  if (any(known)) {
    voters.active[rows[known], i] <- d$Active[known]
    voters.total[rows[known], i]  <- d$Total[known]
  }
}

# Use "TOTAL" rowname for use with census callback plot routine
rownames(voters.active)[nrow(voters.active)] <- "TOTAL"
rownames(voters.total)[nrow(voters.total)]   <- "TOTAL"

# KLUDGE
# Zero state totals when counties are missing in 2006.
# NOTE: this assumes the 2006 file is the second input file (column 2).
if (ncol(voters.active) >= 2) {
  voters.active[nrow(voters.active), 2] <- 0
  voters.total[nrow(voters.total), 2]   <- 0
}

# Set 0s to NA's
voters.active[voters.active == 0] <- NA
voters.total[voters.total == 0]   <- NA

# Remove all periods from "St." and "Ste." in county names
rownames(voters.active) <- gsub("\\.", "", rownames(voters.active))
rownames(voters.total)  <- gsub("\\.", "", rownames(voters.total))

# Write files
write.csv(voters.active, output.active)
write.csv(voters.total, output.total)