# Technical Analysis of Montana Voter Registration files: # * Statewide Voter File # * Voter History File # See # * https://app.mt.gov/voterfile/instruct.html # * https://app.mt.gov/voterfile/about.html # * https://app.mt.gov/voterfile/fees.html # NOTE: Online about.html above has Carter and Cascade county codes switched. # efg, 10 Sept. 2012 # Earl F Glynn # Franklin Center for Government and Public Integrity ################################################################################ ### Setup COUNTS <- "Counts" # Directory name for frequency count files basedir <- "F:/Voter-Registration/Montana/2012-02-09/" ### FIX ME EACH TIME ### setwd(basedir) FILENAME <- "MT_Export_051412.txt" # NOTE: Carter and Cascade county codes fixed. 12 Sept. 2012. # (Found when trying to match voter data to US Census data) County.Codes <- read.csv("Montana-County-Codes.csv", as.is=TRUE, row.names=1,colClasses="character") options(width=80) ################################################################################ # Statewide Voter File ################################################################################ ### Delimiter Check # Without "comment.char" override, records with # characters are problems. # Without "quote" override, records with apostrophe's or quotation marks # are problems. counts <- count.fields(FILENAME, sep="\t", comment.char="", quote="") table(counts) #counts # 36 37 # 1 633240 # The first row is OK, but all other rows end with a tab followed by # no data. This confuses R's read.delim, but we'll shift the headers. counts[1:2] #[1] 36 37 sum(table(counts)) #[1] 633241 ################################################################################ ### Read first few rows for overview of file MT5 <- read.delim(FILENAME, as.is=TRUE, nrows=5, row.names = NULL, # force row numbering quote="", comment.char="", colClasses="character") # Problem: column headers are shifted right with first column # erroneous labeld "row.names". Call last column "X" for now. colnames(MT5) <- c(colnames(MT5)[-1], "X") MT5 ################################################################################ ### Now that the first 5 rows look OK, let's get the complete file # Read complete file MT <- read.delim(FILENAME, as.is=TRUE, row.names = NULL, # force row numbering quote="", comment.char="", colClasses="character") dim(MT) #[1] 633240 37 # Fix column names colnames(MT) <- c(colnames(MT)[-1], "X") # Verify last column "X" is worthless table(MT$X) # Remove last column MT <- MT[,-ncol(MT)] dim(MT) #[1] 633240 36 ################################################################################ ### Verify all IDs are unique stopifnot(nrow(MT) == length(unique(MT$VTRID))) ################################################################################ ### Add County name field using hash lookup MT$County <- County.Codes[MT$CURRENT_COUNTY,1] ################################################################################ ### Compute voter totals by county # Form cross tab that we want. # Delete first row with the header. counts <- as.data.frame.matrix(table(MT$County,MT$VOTER_STATUS)) x <- data.frame(County=rownames(counts), Active=counts$Active, Inactive=counts$Inactive, Pending=counts$Pending, Provisional=counts$Provisional, Total=counts$Active+counts$Inactive+counts$Pending+counts$Provisional, stringsAsFactors=FALSE) # Compute state totals x <- rbind(x, data.frame(County="TOTAL", Active=sum(x$Active), Inactive=sum(x$Inactive), Pending=sum(x$Pending), Provisional=sum(x$Provisional), Total=sum(x$Total))) # Write file write.csv(x, paste("MT-voter-status.csv", sep=""), row.names=FALSE) x <- as.data.frame.matrix(table(MT$VOTER_STATUS_REASON, MT$VOTER_STATUS)) write.csv(x, paste("MT-voter-status-reasons.csv", sep="")) ################################################################################ # Create COUNTS directory if it does not exist. if (! file.exists(COUNTS)) { dir.create(COUNTS) } ################################################################################ # Look at each colum and create descriptive stats of data field.summary <- NULL for (i in 1:length(colnames(MT))) { column <- colnames(MT)[i] field <- MT[,i] counts <- table(field) counts <- data.frame(as.table(counts)) colnames(counts) <- c(column,"Count") COUNTS.filename <- paste(COUNTS, "/", sprintf("%02d",i),"-", column, ".csv",sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) cat(i, column, "\n") flush.console() Missing <- sum(nchar(field) == 0) N <- length(field) - Missing NUnique <- length(unique(field)) zero.length.field <- (nchar(field) == 0) # should trim char strings? NCounty <- length(unique(MT$County[!zero.length.field])) field.summary <- rbind(field.summary, c(i, column, as.vector(summary( nchar(field[!zero.length.field]) ) ), N, Missing, NUnique, NCounty) ) } colnames(field.summary) <- c("No", "Field", "Min", "Q1", "Median", "Mean", "Q3", "Max", "N", "Missing", "NUnique", "NCounty") write.csv(field.summary, file="MT-FieldLengthSummary.csv", row.names=FALSE) ################################################################################ # Voter History File ################################################################################ FILENAME <- "MT_VH_Export_051412.txt" # Verify same number of tokens on each line counts <- count.fields(FILENAME, sep="\t", comment.char="", quote="") table(counts) h <- read.delim(FILENAME, as.is=TRUE, row.names = NULL, # force row numbering quote="", comment.char="", colClasses="character") dim(h) #[1] 5285042 9 # How many voters have history? length(unique(h$Voter.ID)) #[1] 602605 # How many are for history in voter file length(intersect(MT$VTRID, h$Voter.ID)) #[1] 591276 # How many voters have history but no voter file entry? length(setdiff(h$Voter.ID, MT$VTRID)) #[1] 11329 # Fraction of voter file with voter history length(intersect(MT$VTRID, h$Voter.ID)) / nrow(MT) #[1] 0.9337313 # On average number of ballots per voter nrow(h) / length(unique(h$Voter.ID)) #[1] 8.770326 # Convert county number to name h$County <- County.Codes[h$js_code,1] table(h$js_code) # Ballot history by county counts <- data.frame(table(h$js_code)) colnames(counts) <- c("js_code", "Frequency") COUNTS.filename <- paste(COUNTS, "/js_code.csv", sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) counts <- data.frame(table(h$County, useNA="always")) colnames(counts) <- c("ballot_county", "Frequency") COUNTS.filename <- paste(COUNTS, "/ballot_county.csv", sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) # election_type counts <- data.frame(table(h$election_type)) colnames(counts) <- c("election_type", "Frequency") COUNTS.filename <- paste(COUNTS, "/Election_type.csv", sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) # voting method counts <- data.frame(table(h$vvm_id)) colnames(counts) <- c("VotingMethod", "Frequency") COUNTS.filename <- paste(COUNTS, "/voting-method.csv", sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) # Election Dates # Verify times in Election_date have no information; chop the time off table(nchar(h$Election_date)) # 22 #5285042 table(substr(h$Election_date, 12,22)) #12:00:00 AM # 5285042 h$Election_date <- substr(h$Election_date, 1, 10) # Use ISO8601 format: YYYY-MM-DD h$Election_date <- as.character(strptime(h$Election_date, "%m/%d/%Y")) # Frequency by Election Date counts <- data.frame(table(h$Election_date)) colnames(counts) <- c("Election_date", "Frequency") COUNTS.filename <- paste(COUNTS, "/Election_date.csv", sep="") write.csv(counts, file=COUNTS.filename, row.names=FALSE) Voter.List <- list(h$Voter.ID) # Ballots per voter Ballots.By.Voter <- aggregate(h[,c(1,3)], by=Voter.List, length) Ballots.By.Voter <- Ballots.By.Voter[,1:2] colnames(Ballots.By.Voter) <- c("Voter.ID", "Ballots") # Look at last election for voter # Unclear why this is much faster with extra column 1 here. Last.Election.By.Voter <- aggregate(h[,c(1,3)], by=Voter.List, max) Last.Election.By.Voter <- Last.Election.By.Voter[,c(2,3)] election.stats <- merge(Ballots.By.Voter, Last.Election.By.Voter) colnames(election.stats)[3] <- "LastElection" dim(election.stats) #[1] 602605 3 # Add Ballots and LastElection to voter file MT <- merge(MT, election.stats, by.x="VTRID", by.y="Voter.ID", all.x=TRUE) dim(MT) SELECT <- (MT$LastElection < "2007") SELECT[is.na(SELECT)] <- FALSE Last.Voted.2006.or.before <- MT[SELECT, c("VTRID", "FIRSTNAME", "MIDDLENAME", "LASTNAME", "VOTER_STATUS", "County", "Ballots", "LastElection")] dim(Last.Voted.2006.or.before) #[1] 11445 8 write.csv(Last.Voted.2006.or.before, "Last-Voted-2006-or-before.csv", row.names=FALSE) table(Last.Voted.2006.or.before$VOTER_STATUS) # Active Inactive Provisional # 2324 9111 10 counts <- as.data.frame.matrix(table(Last.Voted.2006.or.before$County, Last.Voted.2006.or.before$VOTER_STATUS)) COUNTS.filename <- paste(COUNTS, "/Voter-Last-Voted-2006-Voter-Status-by-County.csv", sep="") write.csv(counts, file=COUNTS.filename) # Could look at stats of mail ballots sent but not received. ################################################################################ write.table(MT, "MT-Voter-File-With-History-Stats.txt", sep="\t", quote=FALSE, row.names=FALSE)