### A demonstration of multivariate regression in R

## 1. Coding issues

# use car for scatterplots
library(car)

# Load the video data
Videos = read.delim("videos.txt")
summary(Videos)

# Let's examine the categories variable
summary(Videos$category)

# Notice the mysterious first entry in the summary,
# which has a number but no text
# We can confirm this by checking the values of this variable
levels(Videos$category)

# It looks like missing categories are incorrectly 
# coded as empty strings.  We should correct these to NA's
Videos$category[Videos$category == ""] = NA

# the 9 data points are gone, but the "" still 
# appears as a level.
summary(Videos$category)

# Can fix this by recreating the factor in R
Videos$category = factor(Videos$category)

# Next, let's look at the rate variable, which
# represents the average rating given to a video on a
# 5-point scale.  We would typically begin by looking
# at a histogram
hist(Videos$rate)

# Notice the surprising peak around zero - is that meaningful?

# Let's check if these values are exactly zero, or just close to zero.
# Create a vector of videos that have a zero rating exactly
zero_rate = Videos$rate == 0

# And see how many of these videos there are
summary(zero_rate)

# We may check to see how many ratings these videos have.
# According to the documentation for the dataset, the number
# of ratings a video has is coded in the ratings vector
Videos$ratings[zero_rate]

# See all the zeros?  We conclude that videos with no 
# ratings are being coded
# with an average rating of zero.  This would throw off
# our means and other statistics
# get the vector of videos with no ratings
no_rate = Videos$ratings == 0

# and recode the rate variable as missing for these videos
Videos$rate[no_rate] = NA

# Let's similarly inspect the age variable
hist(Videos$age)
# Notice the peak at age 0

# get the vector of videos with age zero to look closer
zero_age = Videos$age == 0

# could it be that these are the unrated videos again?
Videos$ratings[zero_age]

# Let's look at all of the columns to see if anything
# jumps out about these datapoints
# We need to subset the dataset, pulling out just the rows
# which for the zero-age videos
Videos[Videos$age == 0,]

# Notice that they all come from the UNA category
# We can check that all Videos in UNA also have zero age
Videos$age[Videos$category == " UNA "]


# At this point, I went and read the documentation
# on the researchers' website.  The age is supposed to
# show days from the formation of Youtube
# to the time the video was uploaded.

# Because we don't expect many videos the day
# youtube was created, this confirms out suspicion
# that the zeros represent missing values
# This also suggests that the UNA category represents
# missing values, so lets recode all of these as NAs
Videos$age[zero_age] = NA
Videos$category[zero_age] = NA
Videos$category = factor(Videos$category)

# Moreover, for an age variable, we'd rather 
# measure from the day the video was created to
# the day the data was gathered.
# We can do that by subtracting the maximum age from each
# video's age.
Videos$age = max(Videos$age, na.rm = T)- Videos$age

# Let's see if that looks more like what we'd expect
hist(Videos$age)


## 2. Regression Preparation

# If we were exploring, rather than testing a specific
# hypothesis, we could begin by checking the correlation
# matrix
# However, do NOT do this and pick out promising variables,
# then test their significance in regression.  This creates
# inflated error rates, breaking down the machinery
# of hypothesis testing.
cor(Videos[,c(3,5:9)], use = "pairwise.complete.obs")


# Let's investigate the determinants of something any
# video creator might care about - the number of views
hist(Videos$views)

# First, we probably want to transform views to make
# it more normal.
hist(log10(Videos$views))

# Suppose our theory tells us that the rate
# (representing video quality) and the category
# are important determinants of the number of views
# We also believe that it makes sense to control for 
# age, since older videos obviously have more time to 
# accumulate views.

# Our plan is therefore to run a hierarchical regression,
# beginning with age, and then adding category and rate.
# Since we are planning to compare our models against
# each other, we should first pull out the rows that have
# no missing values across all of these variables.  Otherwise,
# we'd have to delete cases as we add each new varible, and
# we won't be able to compare models with ANOVA.
# We can get the rows that have values for all of these
# variables with:
lim_rows = complete.cases(Videos$views, Videos$age, Videos$category, Videos$rate)
lim_rows

# Pull those rows out to create a subset of the dataset
Videos_lim = Videos[lim_rows,]


# 3. Regression Analysis

# The scatterplot shows a strong relationship
scatterplot(Videos_lim$age, log10(Videos_lim$views))

# Run a simple regression
model1 = lm(log10(views) ~ age, data = Videos_lim)
summary(model1)

# use the plot command for common diagnostics
plot(model1)

# Let's add in the category variable.
# This is automatically dummy-coded
# for us.  Can see dummy variables with
contrasts(Videos$category)


model2 = lm(log10(views) ~ age + category, data = Videos_lim)
summary(model2)
plot(model2)

# it seems like the category matters for the number of views,
# especially for Entertainment, Film and Animation, Music,
# Gaming, and Sports

# check the improvement from model 1 to model 2 is statistically
# significant
anova(model1, model2)

# we could also compare the R squares for the models, or
# compare them with Akaike's information criterion
AIC(model1)
AIC(model2)
# We want a smaller AIC - that indicates a better 
# fitting or more parsimonious model

# Let's see what happens when we add the rate variable
model3 = lm(log10(views) ~  age + category + rate, data = Videos_lim)
summary(model3)
plot(model3)

# Notice that rate has a highly significant effect
# also, look at what happened to the effects of the
# categories!

# compare the model improvement with anova
anova(model2, model3)

# and check the AIC
AIC(model2)
AIC(model3)