# Explore the relationship between GDP and internet usage across countries:
# plot the raw data, look up the GDP outlier by sorting, then drop rows with
# missing internet data and plot again.

# Need car for the scatterplot function
library(car)

# Loads the objects stored in the file into the workspace; the rest of the
# script expects a data frame named `Countries` to be among them.
load("Countries.Rdata")

# Shorter alias column for the 2012 GDP figures
Countries$gdp <- Countries$gdp2012
summary(Countries)

# Label rows by country name so points and printed rows are identifiable
rownames(Countries) <- Countries$Country

# Observe the original scatterplot with one outlier on the gdp axis
# NOTE(review): reg.line / smooth / spread / id.method / span are passed
# exactly as in the original script; confirm they match the argument names
# of the installed car version.
scatterplot(
  internet_users_2011 ~ gdp,
  reg.line = lm,
  smooth = TRUE,
  spread = TRUE,
  id.method = "identify",
  boxplots = "xy",
  span = 0.5,
  data = Countries
)

# Let's sort the data to see what the outlier is.
# Note that order gives a vector of positions (negating gdp sorts descending).
order(-Countries$gdp)

# Can use it as an index vector, either for a vector
Countries$gdp[order(-Countries$gdp)]

# Or to subset an entire dataframe and look at the first few entries
Countries2 <- Countries[order(-Countries$gdp), c("Country", "gdp")]
head(Countries2)

# Next, make a boolean vector to identify countries missing internet data.
# Note that using != does not work correctly, because
# NA takes over the whole expression: every element of the result is NA.
# (This line is kept on purpose as a demonstration of the pitfall.)
Countries$internet_users_2011 != NA

# Use the is.na function instead
!is.na(Countries$internet_users_2011)

# Can use this as an index vector to subset our dataframe
Countries3 <- Countries[!is.na(Countries$internet_users_2011), ]

# And make the scatterplot again
scatterplot(
  internet_users_2011 ~ gdp,
  reg.line = lm,
  smooth = TRUE,
  spread = TRUE,
  id.method = "identify",
  boxplots = "xy",
  span = 0.5,
  data = Countries3
)