### A demonstration of multivariate regression in R ## 1. Coding issues # use car for scatterplots library(car) # Load the video data Videos = read.delim("videos.txt") summary(Videos) # Let's examine the categories variable summary(Videos$category) # Notice the mysterious first entry in the summary, # which has a number but no text # We can confirm this by checking the values of this variable levels(Videos$category) # It looks like missing categories are incorrectly # coded as empty strings. We should correct these to NA's Videos$category[Videos$category == ""] = NA # the 9 data points are gone, but the "" still # appears as a level. summary(Videos$category) # Can fix this by recreating the factor in R Videos$category = factor(Videos$category) # Next, let's look at the rate variable, which # represents the average rating given to a video on a # 5-point scale. We would typically begin by looking # at a histogram hist(Videos$rate) # Notice the surprising peak around zero - is that meaningful? # Let's check if these values are exactly zero, or just close to zero. # Create a vector of videos that have a zero rating exactly zero_rate = Videos$rate == 0 # And see how many of these videos there are summary(zero_rate) # We may check to see how many ratings these videos have. # According to the documentation for the dataset, the number # of ratings a video has is coded in the ratings vector Videos$ratings[zero_rate] # See all the zeros? We conclude that videos with no # ratings are being coded # with an average rating of zero. This would throw off # our means and other statistics # get the vector of videos with no ratings no_rate = Videos$ratings == 0 # and recode the rate variable as missing for these videos Videos$rate[no_rate] = NA # Let's similarly inspect the age variable hist(Videos$age) # Notice the peak at age 0 # get the vector of videos with age zero to look closer zero_age = Videos$age == 0 # could it be that these are the unrated videos again? Videos$ratings[zero_age] # Let's look at all of the columns to see if anything # jumps out about these datapoints # We need to subset the dataset, pulling out just the rows # which for the zero-age videos Videos[Videos$age == 0,] # Notice that they all come from the UNA category # We can check that all Videos in UNA also have zero age Videos$age[Videos$category == " UNA "] # At this point, I went and read the documentation # on the researchers' website. The age is supposed to # show days from the formation of Youtube # to the time the video was uploaded. # Because we don't expect many videos the day # youtube was created, this confirms out suspicion # that the zeros represent missing values # This also suggests that the UNA category represents # missing values, so lets recode all of these as NAs Videos$age[zero_age] = NA Videos$category[zero_age] = NA Videos$category = factor(Videos$category) # Moreover, for an age variable, we'd rather # measure from the day the video was created to # the day the data was gathered. # We can do that by subtracting the maximum age from each # video's age. Videos$age = max(Videos$age, na.rm = T)- Videos$age # Let's see if that looks more like what we'd expect hist(Videos$age) ## 2. Regression Preparation # If we were exploring, rather than testing a specific # hypothesis, we could begin by checking the correlation # matrix # However, do NOT do this and pick out promising variables, # then test their significance in regression. This creates # inflated error rates, breaking down the machinery # of hypothesis testing. cor(Videos[,c(3,5:9)], use = "pairwise.complete.obs") # Let's investigate the determinants of something any # video creator might care about - the number of views hist(Videos$views) # First, we probably want to transform views to make # it more normal. hist(log10(Videos$views)) # Suppose our theory tells us that the rate # (representing video quality) and the category # are important determinants of the number of views # We also believe that it makes sense to control for # age, since older videos obviously have more time to # accumulate views. # Our plan is therefore to run a hierarchical regression, # beginning with age, and then adding category and rate. # Since we are planning to compare our models against # each other, we should first pull out the rows that have # no missing values across all of these variables. Otherwise, # we'd have to delete cases as we add each new varible, and # we won't be able to compare models with ANOVA. # We can get the rows that have values for all of these # variables with: lim_rows = complete.cases(Videos$views, Videos$age, Videos$category, Videos$rate) lim_rows # Pull those rows out to create a subset of the dataset Videos_lim = Videos[lim_rows,] # 3. Regression Analysis # The scatterplot shows a strong relationship scatterplot(Videos_lim$age, log10(Videos_lim$views)) # Run a simple regression model1 = lm(log10(views) ~ age, data = Videos_lim) summary(model1) # use the plot command for common diagnostics plot(model1) # Let's add in the category variable. # This is automatically dummy-coded # for us. Can see dummy variables with contrasts(Videos$category) model2 = lm(log10(views) ~ age + category, data = Videos_lim) summary(model2) plot(model2) # it seems like the category matters for the number of views, # especially for Entertainment, Film and Animation, Music, # Gaming, and Sports # check the improvement from model 1 to model 2 is statistically # significant anova(model1, model2) # we could also compare the R squares for the models, or # compare them with Akaike's information criterion AIC(model1) AIC(model2) # We want a smaller AIC - that indicates a better # fitting or more parsimonious model # Let's see what happens when we add the rate variable model3 = lm(log10(views) ~ age + category + rate, data = Videos_lim) summary(model3) plot(model3) # Notice that rate has a highly significant effect # also, look at what happened to the effects of the # categories! # compare the model improvement with anova anova(model2, model3) # and check the AIC AIC(model2) AIC(model3)