This tutorial aligns with DataCamp’s Introduction to Regression in R tutorial.

Linear Regression

Linear Regression is an incredibly important tool. It uses multiple numerical independent variables to predict a single output/dependent variable.

The output variable is generally a continuous numerical variable.

Outcomes:

Terms:

Help:

Minimal Example

library(tidyverse)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  extra = c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1)
)

# Quick viz of data showing the relationship.
pairs( t_data )

# Create a linear regression model predicting `to` with `from` and using the entire dataset.
model = lm(to ~ from, data = t_data)

# Show a summary of the linear model. Pay attention to these key items:
#   Residuals = errors in each row between actual & predicted
#   Std Error = the squared difference between
#      the predicted and actual values. 
#   Coefficients: 
#     Estimate: value of the change in input to output
#     Std. Error: averages squared diff between prediction / actual
#     p Value: probability of the estimate being a result of random chance
#   Residual standard error: the overall avg difference between actual
#     and predicted for the entire model. 
#   Adj R^2,  the % of variation explained by the model
summary(model)
## 
## Call:
## lm(formula = to ~ from, data = t_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.654 -0.554  0.370  0.403  0.430 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  11.6667     0.3732    31.3 0.0000000012 ***
## from         -1.0121     0.0601   -16.8 0.0000001576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.546 on 8 degrees of freedom
## Multiple R-squared:  0.973,  Adjusted R-squared:  0.969 
## F-statistic:  283 on 1 and 8 DF,  p-value: 0.000000158

Full example

library(tidyverse)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1)
)

# Quick viz of data showing the relationship. Look for any obvious correlation values.
# We cant to avoid any perfectly correlated data, as that will cause problems in our regression.
pairs( t_data )

# Show above, but in numbers.
cor( t_data, use = 'pairwise.complete.obs' )
##        from     to
## from  1.000 -0.986
## to   -0.986  1.000
# Create a linear regression model predicting `to` with `from` and using the train dataset.
model = lm(to ~ from, data = t_data)

# Show a summary of the linear model. Pay attention to these key items:
#   Residuals = errors in each row between actual & predicted
#   Std Error = the squared difference between
#      the predicted and actual values. 
#   Coefficients: 
#     Estimate: value of the change in input to output
#     Std. Error: averages squared diff between prediction / actual
#     p Value: probability of the estimate being a result of random chance
#   Residual standard error: the overall avg difference between actual
#     and predicted for the entire model. 
#   Adj R^2,  the % of variation explained by the model
summary(model)
## 
## Call:
## lm(formula = to ~ from, data = t_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.654 -0.554  0.370  0.403  0.430 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  11.6667     0.3732    31.3 0.0000000012 ***
## from         -1.0121     0.0601   -16.8 0.0000001576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.546 on 8 degrees of freedom
## Multiple R-squared:  0.973,  Adjusted R-squared:  0.969 
## F-statistic:  283 on 1 and 8 DF,  p-value: 0.000000158
# Show a plot of residuals / errors
hist(model$residuals)

# If we plug in a 95% confidence interval, how accurate are we?
confint(model, level = 0.95)
##             2.5 % 97.5 %
## (Intercept) 10.81 12.527
## from        -1.15 -0.873
# Now look at how the data performs on the data.
# This function will take in a tibble and a model, test it,
#   and calculate the resulting average error per item.
rmse <- function( m, tibble, dependent_variable ) {
  results <- predict(m, tibble)
  errors <- results - dependent_variable
  return( sqrt(mean(errors^2, na.rm = TRUE)))
}

rmse(model, t_data, t_data$to)
## [1] 0.489

Preprocessing

We have some clean-up to do with variables used in regression.

Low-variance variables

Look for values that have low variation and remove them from the dataset.

library(tidyverse)
library(caret)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  group_a = c(1, 1, 1, 1, 1, 1, 2, 1, 1, 1),
  group_b = c(3, 3, 1, 1, 1, 1, 2, 1, 2, 5),
)

# saveMetrics give us output description. 
#   freqRatio: top most frequent item / 2nd top most freq item. Goal is to be close to 1
#   percentUnique: unique(items) / total items
#
# Output
#   freqRatio: 
#     from and to are 1.0, meaning that the most frequent value is just as likely
#       as the next-most likely value. 
#     However, group_a is 9.0, meaning that the most likely value  (1) is 9 times as common 
#       as the next most likely (2). Remove from our model!
#     Group_b is better, with the most likely value (1) being 2.5 times as likely as the 
#       next most (2 or 3)
#   
nearZeroVar(t_data, saveMetrics = TRUE)

Remove perfect correlations

Look for values that are perfectly-correlated with each other, and remove all but one.

library(tidyverse)
library(ggcorrplot)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  is_major = c(1, 1, 1, 1, 1, 1, 0, 0, 0, NA),
  is_not_major = c(0, 0, 0, 0, 0, 0, 1, 1, 1, NA),
  from_x_10 = from * 10
)

# Pairs shows a visual of the relationship in our model
# Look for numerical values in a straight line
pairs( t_data)

# Cor returns the correlation. Look for values near 1.0 or -1.0
# use allows us to use more data, ignoring pairs with a NA 
cor( t_data, use = 'pairwise.complete.obs' )
##                from     to is_major is_not_major from_x_10
## from          1.000 -0.985   -0.756        0.756     1.000
## to           -0.985  1.000    0.836       -0.836    -0.985
## is_major     -0.756  0.836    1.000       -1.000    -0.756
## is_not_major  0.756 -0.836   -1.000        1.000     0.756
## from_x_10     1.000 -0.985   -0.756        0.756     1.000
# Visual of above 
ggcorrplot::ggcorrplot(cor( t_data, use = 'pairwise.complete.obs' ))

Remove outliers

Look for values with outlier information

library(tidyverse)
library(ggcorrplot)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  absences = c(0, 0, 0, 23, 0, 0, 2, 3, 1, 1)
)

# Hist is a quick function to display outlier values.
# As a general rule, we may want to remove values that are >1.5 the 
# interquartile range (difference between 50% and 75% values). However, this is a
# judgement call.
hist( t_data$absences)

summary( t_data$absences)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.50    3.00    1.75   23.00
# To cleanup, you can take 3 approaches:
clean_t_data <- t_data %>% 
  # option a: remove rows
  filter(absences < 50) %>%
  # option b: turn into a yes/no 1/0 variable
  mutate(absences_excessive = ifelse(absences > 5, 1, 0)) %>% 
  # option c: cap the field
  mutate(absences_capped = ifelse(absences > 5, 5, absences))

print(clean_t_data)

Add dummy variables

Change text fields into 1/0 values.

library(tidyverse)
library(ggcorrplot)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  major = c("Y", "Y", "Y", "Y", "Y", "Y", "N", "N", "N", NA)
)

table( t_data$major)
## 
## N Y 
## 3 6
# To cleanup, use ifelse
clean_t_data <- t_data %>% 
  mutate(major01 = ifelse(major == 'Y', 1, 0))

print(clean_t_data)

Collapse fields

You may also want to join text categories into smaller groups

library(tidyverse)
library(ggcorrplot)

t_data <- tibble(
  from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
  to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
  major = c("Act", "Act", "Fin", "Act", "Marketing", "Act", "Act", "Act", "Act", "Act")
)

table( t_data$major)
## 
##       Act       Fin Marketing 
##         8         1         1
# To cleanup, use ifelse
clean_t_data <- t_data %>% 
  mutate(major_not_accounting01 = ifelse(major == 'Fin' | major == "Marketing", 1, 0))

print(clean_t_data)