This tutorial aligns with DataCamp’s Introduction to Regression in R tutorial.
Linear Regression is an incredibly important tool. It uses multiple numerical independent variables to predict a single output/dependent variable.
The output variable is generally a continuous numerical variable.
Outcomes:
Terms:
Help:
library(tidyverse)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
extra = c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1)
)
# Quick viz of data showing the relationship.
pairs( t_data )
# Create a linear regression model predicting `to` with `from` and using the entire dataset.
model = lm(to ~ from, data = t_data)
# Show a summary of the linear model. Pay attention to these key items:
# Residuals = errors in each row between actual & predicted
# Std Error = the squared difference between
# the predicted and actual values.
# Coefficients:
# Estimate: value of the change in input to output
# Std. Error: averages squared diff between prediction / actual
# p Value: probability of the estimate being a result of random chance
# Residual standard error: the overall avg difference between actual
# and predicted for the entire model.
# Adj R^2, the % of variation explained by the model
summary(model)
##
## Call:
## lm(formula = to ~ from, data = t_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.654 -0.554 0.370 0.403 0.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.6667 0.3732 31.3 0.0000000012 ***
## from -1.0121 0.0601 -16.8 0.0000001576 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.546 on 8 degrees of freedom
## Multiple R-squared: 0.973, Adjusted R-squared: 0.969
## F-statistic: 283 on 1 and 8 DF, p-value: 0.000000158
library(tidyverse)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1)
)
# Quick viz of data showing the relationship. Look for any obvious correlation values.
# We cant to avoid any perfectly correlated data, as that will cause problems in our regression.
pairs( t_data )
# Show above, but in numbers.
cor( t_data, use = 'pairwise.complete.obs' )
## from to
## from 1.000 -0.986
## to -0.986 1.000
# Create a linear regression model predicting `to` with `from` and using the train dataset.
model = lm(to ~ from, data = t_data)
# Show a summary of the linear model. Pay attention to these key items:
# Residuals = errors in each row between actual & predicted
# Std Error = the squared difference between
# the predicted and actual values.
# Coefficients:
# Estimate: value of the change in input to output
# Std. Error: averages squared diff between prediction / actual
# p Value: probability of the estimate being a result of random chance
# Residual standard error: the overall avg difference between actual
# and predicted for the entire model.
# Adj R^2, the % of variation explained by the model
summary(model)
##
## Call:
## lm(formula = to ~ from, data = t_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.654 -0.554 0.370 0.403 0.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.6667 0.3732 31.3 0.0000000012 ***
## from -1.0121 0.0601 -16.8 0.0000001576 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.546 on 8 degrees of freedom
## Multiple R-squared: 0.973, Adjusted R-squared: 0.969
## F-statistic: 283 on 1 and 8 DF, p-value: 0.000000158
# Show a plot of residuals / errors
hist(model$residuals)
# If we plug in a 95% confidence interval, how accurate are we?
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 10.81 12.527
## from -1.15 -0.873
# Now look at how the data performs on the data.
# This function will take in a tibble and a model, test it,
# and calculate the resulting average error per item.
rmse <- function( m, tibble, dependent_variable ) {
results <- predict(m, tibble)
errors <- results - dependent_variable
return( sqrt(mean(errors^2, na.rm = TRUE)))
}
rmse(model, t_data, t_data$to)
## [1] 0.489
We have some clean-up to do with variables used in regression.
Look for values that have low variation and remove them from the dataset.
library(tidyverse)
library(caret)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
group_a = c(1, 1, 1, 1, 1, 1, 2, 1, 1, 1),
group_b = c(3, 3, 1, 1, 1, 1, 2, 1, 2, 5),
)
# saveMetrics give us output description.
# freqRatio: top most frequent item / 2nd top most freq item. Goal is to be close to 1
# percentUnique: unique(items) / total items
#
# Output
# freqRatio:
# from and to are 1.0, meaning that the most frequent value is just as likely
# as the next-most likely value.
# However, group_a is 9.0, meaning that the most likely value (1) is 9 times as common
# as the next most likely (2). Remove from our model!
# Group_b is better, with the most likely value (1) being 2.5 times as likely as the
# next most (2 or 3)
#
nearZeroVar(t_data, saveMetrics = TRUE)
Look for values that are perfectly-correlated with each other, and remove all but one.
library(tidyverse)
library(ggcorrplot)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
is_major = c(1, 1, 1, 1, 1, 1, 0, 0, 0, NA),
is_not_major = c(0, 0, 0, 0, 0, 0, 1, 1, 1, NA),
from_x_10 = from * 10
)
# Pairs shows a visual of the relationship in our model
# Look for numerical values in a straight line
pairs( t_data)
# Cor returns the correlation. Look for values near 1.0 or -1.0
# use allows us to use more data, ignoring pairs with a NA
cor( t_data, use = 'pairwise.complete.obs' )
## from to is_major is_not_major from_x_10
## from 1.000 -0.985 -0.756 0.756 1.000
## to -0.985 1.000 0.836 -0.836 -0.985
## is_major -0.756 0.836 1.000 -1.000 -0.756
## is_not_major 0.756 -0.836 -1.000 1.000 0.756
## from_x_10 1.000 -0.985 -0.756 0.756 1.000
# Visual of above
ggcorrplot::ggcorrplot(cor( t_data, use = 'pairwise.complete.obs' ))
Look for values with outlier information
library(tidyverse)
library(ggcorrplot)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
absences = c(0, 0, 0, 23, 0, 0, 2, 3, 1, 1)
)
# Hist is a quick function to display outlier values.
# As a general rule, we may want to remove values that are >1.5 the
# interquartile range (difference between 50% and 75% values). However, this is a
# judgement call.
hist( t_data$absences)
summary( t_data$absences)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.50 3.00 1.75 23.00
# To cleanup, you can take 3 approaches:
clean_t_data <- t_data %>%
# option a: remove rows
filter(absences < 50) %>%
# option b: turn into a yes/no 1/0 variable
mutate(absences_excessive = ifelse(absences > 5, 1, 0)) %>%
# option c: cap the field
mutate(absences_capped = ifelse(absences > 5, 5, absences))
print(clean_t_data)
Change text fields into 1/0 values.
library(tidyverse)
library(ggcorrplot)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
major = c("Y", "Y", "Y", "Y", "Y", "Y", "N", "N", "N", NA)
)
table( t_data$major)
##
## N Y
## 3 6
# To cleanup, use ifelse
clean_t_data <- t_data %>%
mutate(major01 = ifelse(major == 'Y', 1, 0))
print(clean_t_data)
You may also want to join text categories into smaller groups
library(tidyverse)
library(ggcorrplot)
t_data <- tibble(
from = c(1, 2, 3, 4, 5, 6, 7, 8, NA, 10),
to = c(10, 10, 8, 8, 7, 6, 5, 4, 2, 1),
major = c("Act", "Act", "Fin", "Act", "Marketing", "Act", "Act", "Act", "Act", "Act")
)
table( t_data$major)
##
## Act Fin Marketing
## 8 1 1
# To cleanup, use ifelse
clean_t_data <- t_data %>%
mutate(major_not_accounting01 = ifelse(major == 'Fin' | major == "Marketing", 1, 0))
print(clean_t_data)