# Scrape HTML web pages of Virginia voter registration data from:
# http://www.sbe.virginia.gov/cms/Statistics_Polling_Places/Index.html
# Franklin Center for Government & Public Integrity
# Earl F Glynn
# 22 August 2012.
################################################################################
### Setup
# The XML package supplies readHTMLTable(), which grab.web.page() uses to parse
# each registration page into a list of tables.
library(XML)
# NOTE(review): hard-coded, machine-specific working directory.  The output CSV
# at the end of this script is written relative to it, so adjust this path for
# the local environment before running.
setwd("F:/Voter-Registration/Virginia/census-registration-comparison/Board-of-Elections/")
################################################################################
### Helper functions
# Clean one raw table taken from a scraped registration page.
#
# Args:
#   raw: list of tables (data frames), e.g. the result of readHTMLTable().
#        Columns are expected to carry the default names V1, V2, ...
#   N:   index of the table inside `raw` to clean.
#
# Returns:
#   A data.frame of character columns named Locality, Precincts, Active,
#   Inactive, Overseas and Total.  Rows containing any NA are dropped, as are
#   the two leading header rows when the second one starts with
#   "Active Voters".
fix.table <- function(raw, N)
{
  x <- data.frame(raw[[N]], stringsAsFactors=FALSE)

  # Force every column to character (the table may arrive with factor columns)
  x <- data.frame(lapply(x, as.character), stringsAsFactors=FALSE)

  x <- x[, c(-1,-9:-13)]   # delete selected columns

  # Kludge to recover statewide totals: the numbers sit in the row *before*
  # the one labelled "Statewide Totals:" and start one column too far left.
  total.index <- which(x$V2 == "Statewide Totals:")
  if (length(total.index) > 0)
  {
    total.index <- total.index - 1
    # Shift right-to-left so that no value is overwritten before it is copied.
    x[total.index, 7] <- x[total.index, 5]
    x[total.index, 6] <- x[total.index, 4]
    x[total.index, 4] <- x[total.index, 3]
    x[total.index, 3] <- x[total.index, 2]
    x[total.index, 2] <- x[total.index, 1]
    # Pull the label up from the following row
    x[total.index, 1] <- x[total.index+1, 1]
  }

  # now get rid of column 5 (had state total)
  x <- x[,-5]

  # Unclear why a column 7 is sometimes present with NAs, but get rid of it
  if (ncol(x) == 7) x <- x[,-7]

  x <- x[complete.cases(x),]
  colnames(x) <- c("Locality", "Precincts", "Active", "Inactive", "Overseas", "Total")

  # Final Kludge: drop the two header rows when present.  The row-count guard
  # keeps x[2,1] from being NA (which would make `if` fail) when fewer than
  # two rows survive the NA filter above.
  if (nrow(x) >= 2 && x[2,1] == "Active Voters") x <- x[-1:-2,]

  x
}
# Download one registration-statistics page and return its contents as a
# single data.frame.
#
# Args:
#   URL: address of the HTML page; passed straight to readHTMLTable().
#
# Returns:
#   A data.frame with columns Date, Locality, Precincts, Active, Inactive,
#   Overseas and Total.  The count columns stay character strings, with the
#   thousands separators removed.
#
# Stops with an error unless the page yields exactly 135 rows and, for every
# count column, the rows above the last one sum to the last row (which the
# check treats as the statewide total).
grab.web.page <- function(URL)
{
  raw <- readHTMLTable(URL)

  # The report date is the text that follows " - " in the first column of
  # table 4.
  s <- as.character(raw[[4]]$V1)
  Date <- unlist(strsplit(s, " - "))[2]
  # Convert from "month d, yyyy" to a Date (prints as "yyyy-mm-dd")
  Date <- as.Date(Date,"%b %d, %Y")

  # The listing is spread over four tables of the page
  part1 <- fix.table(raw, 9)
  part2 <- fix.table(raw, 16)
  part3 <- fix.table(raw, 23)
  part4 <- fix.table(raw, 30)

  # Combine the four parts
  d <- rbind(part1, part2, part3, part4)

  # Add date
  d <- data.frame(Date, d, stringsAsFactors=FALSE)

  # Cleanup
  d$Locality <- gsub("\r\n","", d$Locality)
  d$Locality <- gsub("Statewide Totals:","TOTAL", d$Locality)
  count.columns <- c("Precincts", "Active", "Inactive", "Overseas", "Total")
  for (column in count.columns)
  {
    d[[column]] <- gsub(",","", d[[column]])
  }

  # Make sure all rows are present
  stopifnot(nrow(d) == 135)

  # Make sure all Locality rows add up to TOTAL row.
  # The counts are character strings, so parse them explicitly with
  # as.numeric(): data.matrix() maps character columns to factor codes in
  # current versions of R, which would make the sums meaningless.
  last.row <- nrow(d)
  locality.sums <- vapply(d[-last.row, count.columns],
                          function(v) sum(as.numeric(v)), numeric(1))
  statewide <- vapply(d[last.row, count.columns], as.numeric, numeric(1))
  stopifnot(locality.sums == statewide)

  d
}
################################################################################
base.URL <- "http://www.sbe.virginia.gov/cms/Statistics_Polling_Places/Registration_Statistics/"
### Early 2003 and 2004 files follow very different "rules".
### Gave up trying to integrate rules needed for 2003/2004 files with later files.
#URL <- paste(base.URL, "2003/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_January_1,_2003.html", sep="")
#URL <- paste(base.URL, "2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_January_1,_2004.html", sep="")

# Pages to scrape, relative to base.URL, in chronological order
pages <- c("2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_-_July_1,_2004.html",
           "2004/County_City/Number_of_Precincts_Registered_Voters_by_County_City_for_the_November_General_Election_2004.html", ##
           "2005/County_City/Counties_Cities_-_January_1,_2005.html",
           "2005/County_City/Counties_Cities_-_July_1,_2005.html",
           "2006/County_City/Counties_Cities_-_January_1,_2006.html",
           "2006/County_City/Counties_Cities_-_April_1,_2006.html",
           "2006/County_City/Counties_Cities_-_July_1,_2006.html",
           "2006/County_City/Counties_Cities_-_November_1,_2006.html")

# Collect one data.frame per page, then stack them once at the end instead of
# growing the result with rbind() on every iteration.
snapshots <- vector("list", length(pages))
for (i in seq_along(pages))
{
  URL <- paste0(base.URL, pages[i])
  time.point <- grab.web.page(URL)

  # KLUDGE fix for irregularity in file: the second page gets its date set by
  # hand (presumably because its title has no " - <date>" part to parse).
  if (i == 2) time.point$Date <- as.Date("2004-10-04")

  # Progress indicator: date of the page just processed
  cat(as.character(time.point$Date[1]), "\n")
  flush.console()

  snapshots[[i]] <- time.point
}
VA.registration <- do.call(rbind, snapshots)

write.csv(VA.registration, "Virginia-Voter-Registration-2004-2006.csv",
          row.names=FALSE)