### A demonstration of correlation and chi-square in R

# car gives us nice scatterplots
library(car)

# Load Countries dataset
load("~/Desktop/271b/data/Countries.Rdata")
summary(Countries)

# merge in data about corruption
Corruption = read.csv("2012_Corruption_Index.csv", header=TRUE)
summary(Corruption)
Countries = merge(Countries, Corruption, by="Country")

# Add in Google's dataset of government requests to remove content
Requests = read.csv("Removal_Requests.csv")
summary(Requests)

# Create a new variable for total number of requests
Requests$total.takedowns = Requests$Court.Orders + Requests$Executive..Police..etc.


### Correlation: Linear relationships between metric variables

# Let's examine the relationship between corruption and takedown requests
# Note that there are multiple rows per country in the Requests dataframe
head(Requests)

# To merge our datasets, we first need to aggregate the takedown data by Country.
# (we'll lose some variables, such as the product the request referred to)
R2 = aggregate(Requests[,c("Court.Orders", "Executive..Police..etc.", "total.takedowns")], list(Country = Requests$Country), sum)
head(R2)

# Perform the merge
Countries = merge(Countries, R2, by="Country", all = TRUE)

head(Countries)


# Use a scatterplot to see how linear the relationship looks
scatterplot(Countries$cpi, Countries$total.takedowns)

#check the correlation
cor.test(Countries$cpi, Countries$total.takedowns)

# the cor function allows us to construct a correlation matrix
cor(Countries[,c("gdp2012", "cpi", "total.takedowns")], use = "pairwise.complete.obs")

# the output is actually a matrix object, so we can do things like square each value
# to get R-squared
cor(Countries[,c("gdp2012", "cpi", "total.takedowns")], use = "pairwise.complete.obs")**2


### Chi-square: Testing for relationships between categorical variables
# Here are three different approaches, depending on structure of dataset

## 1. Two categorical variables

# Recode corruption as a binary variable
Countries$high_cpi = ifelse(Countries$cpi > mean(Countries$cpi, na.rm = TRUE), "Trustworthy", "Corrupt")
Countries$high_cpi

# Look at the frequency table between region and whether a country is corrupt
table(Countries$region, Countries$high_cpi)

# We store the results of our chi-square test so we can extract more
# values from the output
cs = chisq.test(Countries$region, Countries$high_cpi)

# Examine the test result
cs

# Look at the std. residuals to see which regions contribute most to the result
cs$stdres

# Check the expected counts to see if any are less than 5 and if
# we should therefore try Fisher's exact test
cs$expected

# Use Fisher's exact test in this case:
fisher.test(Countries$region, Countries$high_cpi)


## 2. Count data, one variable in columns

# Consider each request to be the unit of analysis, and consider two variables:
# Whether it came from a corrupt or trustworthy country; and whether it came
# through a court order or executive/police action.  We want to know if these
# variables are independent or related.

# We can use aggregate to collapse the rows to just the high_cpi variable
Corrupt_Source = aggregate(Countries[,c("Court.Orders", "Executive..Police..etc.")], list(high_cpi = Countries$high_cpi), sum, na.rm = TRUE)

# Note that we've created a table of counts:
Corrupt_Source

# Not required, but we can add row names to make the chi-square output prettier
rownames(Corrupt_Source)=Corrupt_Source$high_cpi

# plug our count table into the chi-square test
cs = chisq.test(Corrupt_Source[,c(2,3)])
cs

# Look at the standardized residuals to see which direction the effect is in
cs$stdres

# Check the expected counts to see if any are less than 5 and if
# we should therefore try Fisher's exact test
cs$expected


## 3. Count data, both variables in rows

# Let's see if corrupt countries are likely to target different products
# than trustworthy ones.  For this, we can't aggregate our data by Country
# so go back to the original request data, and merge in the high_cpi variable
# also, remove countries that are missing corruption data
Requests2 = merge(Countries[,c("Country", "high_cpi")], Requests, by="Country")
Requests2 = Requests2[ ! is.na(Requests2$high_cpi),]

summary(Requests2)

# We want separate columns for takedown requests from corrupt countries
# and from trustworthy countries.  Here, we create both columns, and copy
# each value for total.takedowns to the appropriate one.
Corrupt_Product = Requests2[,c("Product","high_cpi")]
Corrupt_Product$Corrupt = ifelse(Requests2$high_cpi == "Corrupt", Requests2$total.takedowns, 0)
Corrupt_Product$Trustworthy = ifelse(Requests2$high_cpi == "Trustworthy", Requests2$total.takedowns, 0)

# Observe that each row only has a positive value in one of the two new columns
head(Corrupt_Product)

# Next we sum Corrupt and Trustworthy columns for each product.
Corrupt_Product =  aggregate(Corrupt_Product[,c("Corrupt","Trustworthy")], list( Product = Corrupt_Product$Product), sum)

# We are left with a contingency table
Corrupt_Product

# We could have also created the table in one step, using the cast command
library(reshape)
Corrupt_Product = cast(Requests2, Product ~ high_cpi , fun = sum, value = c("total.takedowns"))
Corrupt_Product

# Run a chi-square test as before
cs = chisq.test(Corrupt_Product[,c(2,3)])
cs

# Check standardized residuals
cs$stdres

# And expected values
cs$expected

# The fisher test is probably too computationally intensive to run
fisher.test(Corrupt_Product[,c(2,3)])

# could also use monte-carlo simulation to check significance
chisq.test(Corrupt_Product[,c(2,3)], simulate.p.value = T)