### A demonstration of correlation and chi-square in R # car gives us nice scatterplots library(car) # Load Countries dataset load("~/Desktop/271b/data/Countries.Rdata") summary(Countries) # merge in data about corruption Corruption = read.csv("2012_Corruption_Index.csv", header=TRUE) summary(Corruption) Countries = merge(Countries, Corruption, by="Country") # Add in Google's dataset of government requests to remove content Requests = read.csv("Removal_Requests.csv") summary(Requests) # Create a new variable for total number of requests Requests$total.takedowns = Requests$Court.Orders + Requests$Executive..Police..etc. ### Correlation: Linear relationships between metric variables # Let's examine the relationship between corruption and takedown requests # Note that there are multiple rows per country in the Requests dataframe head(Requests) # To merge our datasets, we first need to aggregate the takedown data by Country. # (we'll lose some variables, such as the product the request referred to) R2 = aggregate(Requests[,c("Court.Orders", "Executive..Police..etc.", "total.takedowns")], list(Country = Requests$Country), sum) head(R2) # Perform the merge Countries = merge(Countries, R2, by="Country", all = TRUE) head(Countries) # Use a scatterplot to see how linear the relationship looks scatterplot(Countries$cpi, Countries$total.takedowns) #check the correlation cor.test(Countries$cpi, Countries$total.takedowns) # the cor function allows us to construct a correlation matrix cor(Countries[,c("gdp2012", "cpi", "total.takedowns")], use = "pairwise.complete.obs") # the output is actually a matrix object, so we can do things like square each value # to get R-squared cor(Countries[,c("gdp2012", "cpi", "total.takedowns")], use = "pairwise.complete.obs")**2 ### Chi-square: Testing for relationships between categorical variables # Here are three different approaches, depending on structure of dataset ## 1. Two categorical variables # Recode corruption as a binary variable Countries$high_cpi = ifelse(Countries$cpi > mean(Countries$cpi, na.rm = TRUE), "Trustworthy", "Corrupt") Countries$high_cpi # Look at the frequency table between region and whether a country is corrupt table(Countries$region, Countries$high_cpi) # We store the results of our chi-square test so we can extract more # values from the output cs = chisq.test(Countries$region, Countries$high_cpi) # Examine the test result cs # Look at the std. residuals to see which regions contribute most to the result cs$stdres # Check the expected counts to see if any are less than 5 and if # we should therefore try Fisher's exact test cs$expected # Use Fisher's exact test in this case: fisher.test(Countries$region, Countries$high_cpi) ## 2. Count data, one variable in columns # Consider each request to be the unit of analysis, and consider two variables: # Whether it came from a corrupt or trustworthy country; and whether it came # through a court order or executive/police action. We want to know if these # variables are independent or related. # We can use aggregate to collapse the rows to just the high_cpi variable Corrupt_Source = aggregate(Countries[,c("Court.Orders", "Executive..Police..etc.")], list(high_cpi = Countries$high_cpi), sum, na.rm = TRUE) # Note that we've created a table of counts: Corrupt_Source # Not required, but we can add row names to make the chi-square output prettier rownames(Corrupt_Source)=Corrupt_Source$high_cpi # plug our count table into the chi-square test cs = chisq.test(Corrupt_Source[,c(2,3)]) cs # Look at the standardized residuals to see which direction the effect is in cs$stdres # Check the expected counts to see if any are less than 5 and if # we should therefore try Fisher's exact test cs$expected ## 3. Count data, both variables in rows # Let's see if corrupt countries are likely to target different products # than trustworthy ones. For this, we can't aggregate our data by Country # so go back to the original request data, and merge in the high_cpi variable # also, remove countries that are missing corruption data Requests2 = merge(Countries[,c("Country", "high_cpi")], Requests, by="Country") Requests2 = Requests2[ ! is.na(Requests2$high_cpi),] summary(Requests2) # We want separate columns for takedown requests from corrupt countries # and from trustworthy countries. Here, we create both columns, and copy # each value for total.takedowns to the appropriate one. Corrupt_Product = Requests2[,c("Product","high_cpi")] Corrupt_Product$Corrupt = ifelse(Requests2$high_cpi == "Corrupt", Requests2$total.takedowns, 0) Corrupt_Product$Trustworthy = ifelse(Requests2$high_cpi == "Trustworthy", Requests2$total.takedowns, 0) # Observe that each row only has a positive value in one of the two new columns head(Corrupt_Product) # Next we sum Corrupt and Trustworthy columns for each product. Corrupt_Product = aggregate(Corrupt_Product[,c("Corrupt","Trustworthy")], list( Product = Corrupt_Product$Product), sum) # We are left with a contingency table Corrupt_Product # We could have also created the table in one step, using the cast command library(reshape) Corrupt_Product = cast(Requests2, Product ~ high_cpi , fun = sum, value = c("total.takedowns")) Corrupt_Product # Run a chi-square test as before cs = chisq.test(Corrupt_Product[,c(2,3)]) cs # Check standardized residuals cs$stdres # And expected values cs$expected # The fisher test is probably too computationally intensive to run fisher.test(Corrupt_Product[,c(2,3)]) # could also use monte-carlo simulation to check significance chisq.test(Corrupt_Product[,c(2,3)], simulate.p.value = T)