# Scrape HTML web pages of Virginia voter registration data from:
# http://www.sbe.virginia.gov/cms/Statistics_Polling_Places/Index.html

# Franklin Center for Government & Public Integrity
# Earl F Glynn
# 22 August 2012.

################################################################################
### Setup

library(XML)

# NOTE(review): hard-coded working directory; the output CSV written at the end
# of this script lands here.  Kept as-is so the output location does not change.
setwd("F:/Voter-Registration/Virginia/census-registration-comparison/Board-of-Elections/")

################################################################################
### Helper functions

# Clean one of the tables extracted from a registration-statistics page.
#
# Args:
#   raw: list of tables, as returned by readHTMLTable().
#   N:   index into `raw` of the table to clean.
#
# Returns:
#   A data.frame of character columns named
#   "Locality", "Precincts", "Active", "Inactive", "Overseas", "Total",
#   containing only the rows that have no missing values.
fix.table <- function(raw, N) {
  x <- data.frame(raw[[N]], stringsAsFactors = FALSE)

  # Convert to character strings, parameter above ignored
  x <- data.frame(lapply(x, as.character), stringsAsFactors = FALSE)

  x <- x[, c(-1, -9:-13)]  # delete selected columns

  # Kludge to recover statewide totals: in the row just before the one whose
  # V2 cell reads "Statewide Totals:", shift the values to the right (order of
  # the assignments matters) and copy the first cell of the label row into
  # column 1.
  total.index <- which(x$V2 == "Statewide Totals:")
  if (length(total.index) > 0) {
    total.index <- total.index - 1
    x[total.index, 7] <- x[total.index, 5]
    x[total.index, 6] <- x[total.index, 4]
    x[total.index, 4] <- x[total.index, 3]
    x[total.index, 3] <- x[total.index, 2]
    x[total.index, 2] <- x[total.index, 1]
    x[total.index, 1] <- x[total.index + 1, 1]
  }

  # now get rid of column 5 (had state total)
  x <- x[, -5]

  # Unclear why a column 7 is sometimes present with NAs, but get rid of it
  if (ncol(x) == 7) x <- x[, -7]

  x <- x[complete.cases(x), ]
  colnames(x) <- c("Locality", "Precincts", "Active", "Inactive", "Overseas",
                   "Total")

  # Final Kludge: drop the two leading rows when the second one is a header
  # row.  isTRUE() keeps this from erroring when the table has fewer than two
  # rows (x[2, 1] is then NA); the row-count check in grab.web.page() still
  # rejects such a page.
  if (isTRUE(x[2, 1] == "Active Voters")) x <- x[-1:-2, ]

  x
}

# Download one registration-statistics page and return it as a data.frame.
#
# Args:
#   URL: address of the HTML page to read.
#
# Returns:
#   A data.frame with columns Date, Locality, Precincts, Active, Inactive,
#   Overseas, Total.  Date is a Date (NA when the page heading has no
#   " - <date>" suffix); all other columns are character, with thousands
#   separators removed from the count columns.  The last row is the statewide
#   "TOTAL" row.
#
# Stops with an error when the page does not yield exactly 135 rows or when
# the locality rows do not add up to the TOTAL row.
grab.web.page <- function(URL) {
  raw <- readHTMLTable(URL)

  # The report date is the text after " - " in the first cell of table 4.
  s <- as.character(raw[[4]]$V1)
  Date <- unlist(strsplit(s, " - "))[2]

  # Convert from "month d, yyyy" to "yyyy-mm-dd"
  Date <- as.Date(Date, "%b %d, %Y")

  part1 <- fix.table(raw, 9)
  part2 <- fix.table(raw, 16)
  part3 <- fix.table(raw, 23)
  part4 <- fix.table(raw, 30)

  # Combine four parts
  d <- rbind(part1, part2, part3, part4)

  # Add date
  d <- data.frame(Date, d, stringsAsFactors = FALSE)

  # Cleanup
  d$Locality <- gsub("\r\n", "", d$Locality)
  d$Locality <- gsub("Statewide Totals:", "TOTAL", d$Locality)

  # Strip thousands separators from every count column.
  count.cols <- c("Precincts", "Active", "Inactive", "Overseas", "Total")
  for (col in count.cols) {
    d[[col]] <- gsub(",", "", d[[col]])
  }

  # Make sure all rows are present
  stopifnot(nrow(d) == 135)

  # Make sure all Locality rows add up to TOTAL row.
  # The count columns are character, so convert them explicitly: since
  # R 4.0.0 data.matrix() turns character columns into factor codes rather
  # than the numbers they spell, which would make this check meaningless.
  counts <- do.call(cbind, lapply(d[count.cols], as.numeric))
  last.row <- nrow(counts)
  locality.sums <- colSums(counts[-last.row, , drop = FALSE])
  stopifnot(locality.sums == counts[last.row, ])

  d
}

################################################################################

base.URL <- "http://www.sbe.virginia.gov/cms/Statistics_Polling_Places/Registration_Statistics/"

### Early 2003 and 2004 files follow very different "rules".
### Gave up trying to integrate rules needed for 2003/2004 files with later files.
#URL <- paste(base.URL, "2003/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_January_1,_2003.html", sep="")
#URL <- paste(base.URL, "2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_January_1,_2004.html", sep="")

pages <- c("2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_July_1,_2004.html",
           "2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_for_the_November_General_Election_2004.html",
##         "2005/County_City/Counties_Cities_-_January_1,_2005.html",
           "2005/County_City/Counties_Cities_-_July_1,_2005.html",
           "2006/County_City/Counties_Cities_-_January_1,_2006.html",
           "2006/County_City/Counties_Cities_-_April_1,_2006.html",
           "2006/County_City/Counties_Cities_-_July_1,_2006.html",
           "2006/County_City/Counties_Cities_-_November_1,_2006.html")

# Collect one data.frame per page, then bind them once at the end instead of
# growing the result with rbind() on every iteration.
time.points <- vector("list", length(pages))

for (i in seq_along(pages)) {
  URL <- paste0(base.URL, pages[i])
  time.point <- grab.web.page(URL)

  # KLUDGE fix for irregularity in file: the second entry of `pages` (the
  # November 2004 general-election page) gets its date assigned by hand.
  # This is tied to the position in `pages`; update it if entries are
  # reordered.
  if (i == 2) time.point$Date <- as.Date("2004-10-04")

  # Progress indicator
  cat(as.character(time.point$Date[1]), "\n")
  flush.console()

  time.points[[i]] <- time.point
}

VA.registration <- do.call(rbind, time.points)

write.csv(VA.registration,
          "Virginia-Voter-Registration-2004-2006.csv",
          row.names = FALSE)