SlideShare a Scribd company logo
Introduction to Statistical Learning
Chapter 3
Tarek Dib
tdib03@gmail.com
April 12, 2015
1 Data Manipulation
# Point R at the directory containing the ISLR data files.
# NOTE(review): hard-coded machine-specific path -- adjust when reproducing.
setwd("/home/tarek/ISLR/dataSets")
auto <- read.csv("auto.csv")
# Structure of the data set: horsepower is read as a factor because
# missing values are encoded as "?" in the raw file (see str output below)
str(auto)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2
# Drop the "name" column (column 9): a per-car label, not a predictor.
auto <- auto[, -9]
# Keep only rows with an observed horsepower ("?" marks missing values).
auto <- auto[auto$horsepower != "?", ]
# horsepower was read as a factor because of the "?" entries; go through
# character to recover the numeric values without level-index surprises.
auto$horsepower <- as.numeric(as.character(auto$horsepower))
# Treat cylinders and model year as categorical predictors.
auto$cylinders <- factor(auto$cylinders)
auto$year <- factor(auto$year)
2 Exploratory Data Analysis
1
library(ggplot2)
# mpg against weight, coloured and facetted by cylinder count.
p.weight <- ggplot(auto, aes(x = weight, y = mpg, colour = cylinders))
p.weight +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Weight Grouped By Cylinders")
3 4 5 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
3
4
5
6
8
Mileage vs. Weight Grouped By Cylinders
# mpg against horsepower, one panel per cylinder count.
p.HP <- ggplot(auto, aes(x = horsepower, y = mpg, colour = cylinders))
p.HP +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Horsepower")
3 4 5 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
3
4
5
6
8
Mileage vs. Horsepower
# Pooled mpg vs. horsepower scatter with a least-squares line (no CI band).
# The base plot object is reused later with an explicit abline overlay.
p.mpgHP <- ggplot(auto, aes(x = horsepower, y = mpg))
p.mpgHP +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
10
20
30
40
50 100 150 200
horsepower
mpg
2
# Yearly mpg distributions as box plots, coloured by cylinder count.
p.year <- ggplot(auto, aes(x = year, y = mpg, colour = cylinders))
p.year +
  geom_boxplot() +
  ggtitle("Mileage change over the years Grouped by Cylinders")
10
20
30
40
70 71 72 73 74 75 76 77 78 79 80 81 82
year
mpg
cylinders
3
4
5
6
8
Mileage change over the years Grouped by Cylinders
# mpg distribution within each cylinder group.
plt1 <- ggplot(auto, aes(x = cylinders, y = mpg))
plt1 + geom_boxplot()
10
20
30
40
3 4 5 6 8
cylinders
mpg
3 Simple Linear Regression, mpg vs horsepower
# Simple linear regression of mpg on horsepower.
lm.fit1 <- lm(mpg ~ horsepower, data = auto)
s <- summary(lm.fit1)
library(xtable)
# LaTeX summary table. The caption string was truncated in the extracted
# text; reconstructed with an escaped \label matching the label argument.
summaryTab <- xtable(s,
                     caption = "Simple Linear Regression Model, Summary Table\\label{tab:mpg vs. horsepower}",
                     label = "tab:mpg vs. horsepower")
Rsquared.simple <- s$r.squared
# Predicted mpg at horsepower = 98 with its 95% confidence interval
predict(lm.fit1, newdata = data.frame(horsepower = 98), interval = "confidence")
## fit lwr upr
## 1 24.46708 23.97308 24.96108
3
# 95% prediction interval at horsepower = 98 (wider than the confidence
# interval: it accounts for individual-observation noise, not just the mean)
predict(lm.fit1, newdata=data.frame(horsepower=98), interval="prediction")
## fit lwr upr
## 1 24.46708 14.8094 34.12476
# Redraw the scatter and overlay the fitted line built explicitly from the
# model coefficients (it coincides with stat_smooth's "lm" line).
p.mpgHP +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = coef(lm.fit1)[1], slope = coef(lm.fit1)[2])
10
20
30
40
50 100 150 200
horsepower
mpg
Table 1: Simple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.9359 0.7175 55.66 0.0000
horsepower -0.1578 0.0064 -24.49 0.0000
4 Removing Cylinders 3 and 5
Since there are only 4 cars with 3 cylinders and 3 cars with 5 cylinders, I have decided to exclude cars with 3 and 5
cylinders, and focus on presenting and analyzing cars with 4, 6 and 8 cylinders.
# Exclude the rare 3- and 5-cylinder cars (only 4 and 3 observations each).
# %in% is the idiomatic, NA-safe replacement for chained != comparisons.
auto <- auto[!(auto$cylinders %in% c(3, 5)), ]
# Re-plot mpg vs. horsepower for the remaining 4/6/8-cylinder cars
h1 <- ggplot(auto, aes(x = horsepower, y = mpg, colour = cylinders))
h1 + geom_point() + facet_grid(. ~ cylinders)
4
4 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
4
6
8
# mpg vs. weight per cylinder group, each panel with its own lm fit line.
plt3 <- ggplot(auto, aes(x = weight, y = mpg, colour = cylinders))
plt3 +
  geom_point() +
  facet_grid(. ~ cylinders) +
  stat_smooth(method = "lm", se = FALSE) +
  ggtitle("MPG vs. Weight Grouped by Cylinders")
4 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
4
6
8
MPG vs. Weight Grouped by Cylinders
5 Correlation Matrix
# Create a dataframe with the numeric variables: mpg, displacement, horsepower, weight, acceleration
# Numeric variables only: mpg, displacement, horsepower, weight, acceleration
df1 <- auto[, c("mpg", "displacement", "horsepower", "weight", "acceleration")]
# Correlation matrix
Cor <- cor(df1)
# LaTeX table; the caption's \label escape was lost in the extracted text
CorTab <- xtable(Cor,
                 caption = "Correlation Matrix\\label{tab:Correlation}",
                 label = "Correlation")
6 Multiple Linear Regression Model
# Multiple linear regression: main effects plus a horsepower-by-cylinders
# interaction (allows the horsepower slope to differ per cylinder group).
lm.fit2 <- lm(mpg ~ weight + horsepower + acceleration + cylinders +
                cylinders:horsepower, data = auto)
5
Table 2: Correlation Matrix
mpg displacement horsepower weight acceleration
mpg 1.00 -0.82 -0.78 -0.84 0.42
displacement -0.82 1.00 0.90 0.94 -0.56
horsepower -0.78 0.90 1.00 0.87 -0.69
weight -0.84 0.94 0.87 1.00 -0.43
acceleration 0.42 -0.56 -0.69 -0.43 1.00
# Estimate Variance Inflation Factor (vif) found in the car package.
# If vif >= 10, then remove the predictor for multicollinearity issue
# Variance inflation factors (car package). A (G)VIF of 10 or more flags a
# predictor with a serious multicollinearity problem.
library(car)
vifTab <- xtable(vif(lm.fit2))
s.fit2 <- summary(lm.fit2)
s.fit2.Tab <- xtable(s.fit2, caption = "Multiple Linear Regression Model, Summary Table")
# Keep the R-squared for the model-comparison table at the end
Rsquared.multi <- s.fit2$r.squared
print(vifTab)
GVIF Df GVIF^(1/(2*Df))
weight 10.14 1.00 3.18
horsepower 26.99 1.00 5.20
acceleration 2.81 1.00 1.68
cylinders 2129.64 2.00 6.79
horsepower:cylinders 3527.06 2.00 7.71
# Render the model summary table with the caption placed above it
print(s.fit2.Tab, caption.placement="top")
Table 3: Multiple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57.4208 2.7576 20.82 0.0000
weight -0.0027 0.0007 -3.76 0.0002
horsepower -0.2141 0.0256 -8.38 0.0000
acceleration -0.3145 0.1161 -2.71 0.0071
cylinders6 -22.2642 3.5438 -6.28 0.0000
cylinders8 -18.9358 2.9615 -6.39 0.0000
horsepower:cylinders6 0.1992 0.0356 5.60 0.0000
horsepower:cylinders8 0.1607 0.0240 6.69 0.0000
7 Model Diagnostics
6
−10
0
10
10 20 30
fitted
residuals
Figure 1: Residuals vs. Fitted Values. The figure shows that the homoscedasticity assumption, i.e. constant variance,
is violated. Thus, the multivariate linear regression model developed above seems not to be suitable. The response
variable may need to be transformed and the model refitted. A log transformation of the response (mpg) variable may
rectify the non-constant variance.
−10
0
10
−2 0 2
theoretical
sample
Figure 2: qqnorm and qqline plots to test the normality assumption. The data do not seem to deviate much from
normality. There seem to be a few outliers in the data set.
7
0
20
40
60
−10 0 10 20
residuals
count
0.00
0.05
0.10
0.15
0.20
0.25
0 100 200 300 400
Index
Leverages
Index plot of Leverages
8 Modifying the model
# Log-transform the response to stabilize the non-constant residual variance
# seen in the diagnostics above. (Assignment arrow "<-" restored; the "<"
# was dropped by the text extraction.)
lm.fit3 <- lm(log(mpg) ~ weight + horsepower + acceleration + cylinders +
                cylinders * horsepower, data = auto)
summary(lm.fit3)
##
## Call:
## lm(formula = log(mpg) ~ weight + horsepower + acceleration +
## cylinders + cylinders * horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36326 -0.08544 -0.00528 0.08195 0.63418
##
8
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.421e+00 1.063e-01 41.613 < 2e-16 ***
## weight -1.365e-04 2.738e-05 -4.986 9.40e-07 ***
## horsepower -6.784e-03 9.846e-04 -6.890 2.35e-11 ***
## acceleration -1.308e-02 4.473e-03 -2.924 0.00366 **
## cylinders6 -7.404e-01 1.365e-01 -5.422 1.05e-07 ***
## cylinders8 -4.742e-01 1.141e-01 -4.156 4.01e-05 ***
## horsepower:cylinders6 6.271e-03 1.371e-03 4.574 6.50e-06 ***
## horsepower:cylinders8 3.456e-03 9.263e-04 3.731 0.00022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1439 on 377 degrees of freedom
## Multiple R-squared: 0.8259,Adjusted R-squared: 0.8227
## F-statistic: 255.5 on 7 and 377 DF, p-value: < 2.2e-16
# 2x2 panel of the four standard lm diagnostic plots for the log-mpg model
par(mfrow=c(2,2))
plot(lm.fit3)
2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.6
Fitted values
Residuals
Residuals vs Fitted
387
361365
−3 −2 −1 0 1 2 3
−24
Theoretical Quantiles
Standardizedresiduals
Normal Q−Q
387
361365
2.4 2.6 2.8 3.0 3.2 3.4 3.6
0.01.5
Fitted values
Standardizedresiduals
Scale−Location
387
361365
0.00 0.05 0.10 0.15 0.20 0.25
−24
Leverage
Standardizedresiduals
Cook's distance
0.5
0.5
1
Residuals vs Leverage
334361
387
# R-squared of the modified (log-response) model; "<-" restored -- the
# extraction dropped the "<" of the assignment arrow.
Rsquared.mod <- summary(lm.fit3)$r.squared
8.1 Residuals and QQ Norm of Modified Model
# Stacked diagnostics for the modified model: residuals vs. fitted values,
# residual histogram, and a normal Q-Q plot.
par(mfrow = c(3, 1))
# Quotes on the axis labels restored (lost in extraction); use the
# fitted()/resid() accessors instead of partial-matched $fit/$res.
plot(fitted(lm.fit3), resid(lm.fit3), xlab = "Fitted Values", ylab = "Residuals")
hist(resid(lm.fit3))
qqnorm(resid(lm.fit3))
qqline(resid(lm.fit3))
9
2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.00.20.40.6
Fitted Values
Residuals
Histogram of resid(lm.fit3)
resid(lm.fit3)
Frequency
−0.4 −0.2 0.0 0.2 0.4 0.6
020406080
−3 −2 −1 0 1 2 3
−0.40.00.20.40.6
Normal Q−Q Plot
Theoretical Quantiles
SampleQuantiles
10
9 Comparison of Rsquared value among the Models
# Collect the three R-squared values into a one-column data frame for a
# side-by-side comparison table. Assignment arrows and string quotes
# restored -- both were mangled by the text extraction.
df2 <- data.frame(rbind(Rsquared.simple, Rsquared.multi, Rsquared.mod))
names(df2) <- "Rsquared"
rownames(df2) <- c("Simple", "Multiple", "Modified Multiple")
dfTab <- xtable(df2)
Rsquared
Simple 0.61
Multiple 0.78
Modified Multiple 0.83
11

More Related Content

PDF
Caterpillar cat dp45 k forklift lift trucks service repair manual snet19c 800...
PDF
Math cad prime ncees excercise 444 solution
DOCX
Plug-In Hybrid Simulation
PPTX
Presentation Acme engineering- Two Stage Turbo Shaft Engine- Pratt and Whittney
PDF
Design of a Gear Reducer for a Tractor
PDF
Logistic Regression, Linear and Quadratic Discriminant Analyses, and KNN
DOCX
Logistic Regression, Linear and Quadratic Discriminant Analysis and K-Nearest...
PDF
MH prediction modeling and validation in r (1) regression 190709
Caterpillar cat dp45 k forklift lift trucks service repair manual snet19c 800...
Math cad prime ncees excercise 444 solution
Plug-In Hybrid Simulation
Presentation Acme engineering- Two Stage Turbo Shaft Engine- Pratt and Whittney
Design of a Gear Reducer for a Tractor
Logistic Regression, Linear and Quadratic Discriminant Analyses, and KNN
Logistic Regression, Linear and Quadratic Discriminant Analysis and K-Nearest...
MH prediction modeling and validation in r (1) regression 190709

Similar to Applied Regression Analysis using R (20)

PDF
Regression_Class_Project_-_MTCARS
PDF
Relentless Regression
PPTX
AUTO MPG Regression Analysis
PPTX
Auto MPG Regression Analysis
PDF
Simple Linear Regression with R
DOCX
Assignment 2 linear regression predicting car mpg
PDF
Introduction to tibbles
PPTX
DS.pptx
PDF
Multiple Regression
PPTX
Data analytics Lecture power point presentations
PDF
Deriving insights from data using "R"ight way
PDF
Manipulating Data using base R package
PPTX
Lab practice session.pptx
PDF
Kenny_Automobile_EDA.pdf
PDF
R analysis of covariance
PDF
AutomobileDataAnalysis.pdf
PPTX
Introduction to Regression Analysis and R
PPTX
Data manipulation and visualization in r 20190711 myanmarucsy
PDF
CarProject
PDF
Multiple Linear Regression Applications Automobile Pricing
Regression_Class_Project_-_MTCARS
Relentless Regression
AUTO MPG Regression Analysis
Auto MPG Regression Analysis
Simple Linear Regression with R
Assignment 2 linear regression predicting car mpg
Introduction to tibbles
DS.pptx
Multiple Regression
Data analytics Lecture power point presentations
Deriving insights from data using "R"ight way
Manipulating Data using base R package
Lab practice session.pptx
Kenny_Automobile_EDA.pdf
R analysis of covariance
AutomobileDataAnalysis.pdf
Introduction to Regression Analysis and R
Data manipulation and visualization in r 20190711 myanmarucsy
CarProject
Multiple Linear Regression Applications Automobile Pricing
Ad

Recently uploaded (20)

PPTX
The THESIS FINAL-DEFENSE-PRESENTATION.pptx
PPTX
Introduction-to-Cloud-ComputingFinal.pptx
PDF
22.Patil - Early prediction of Alzheimer’s disease using convolutional neural...
PPT
Reliability_Chapter_ presentation 1221.5784
PPTX
Computer network topology notes for revision
PDF
annual-report-2024-2025 original latest.
PDF
Foundation of Data Science unit number two notes
PPTX
01_intro xxxxxxxxxxfffffffffffaaaaaaaaaaafg
PPTX
Acceptance and paychological effects of mandatory extra coach I classes.pptx
PPTX
IB Computer Science - Internal Assessment.pptx
PPTX
ALIMENTARY AND BILIARY CONDITIONS 3-1.pptx
PDF
BF and FI - Blockchain, fintech and Financial Innovation Lesson 2.pdf
PPTX
STUDY DESIGN details- Lt Col Maksud (21).pptx
PPTX
iec ppt-1 pptx icmr ppt on rehabilitation.pptx
PPTX
advance b rammar.pptxfdgdfgdfsgdfgsdgfdfgdfgsdfgdfgdfg
PPTX
AI Strategy room jwfjksfksfjsjsjsjsjfsjfsj
PPTX
Introduction to Firewall Analytics - Interfirewall and Transfirewall.pptx
PPTX
Business Acumen Training GuidePresentation.pptx
PDF
Galatica Smart Energy Infrastructure Startup Pitch Deck
The THESIS FINAL-DEFENSE-PRESENTATION.pptx
Introduction-to-Cloud-ComputingFinal.pptx
22.Patil - Early prediction of Alzheimer’s disease using convolutional neural...
Reliability_Chapter_ presentation 1221.5784
Computer network topology notes for revision
annual-report-2024-2025 original latest.
Foundation of Data Science unit number two notes
01_intro xxxxxxxxxxfffffffffffaaaaaaaaaaafg
Acceptance and paychological effects of mandatory extra coach I classes.pptx
IB Computer Science - Internal Assessment.pptx
ALIMENTARY AND BILIARY CONDITIONS 3-1.pptx
BF and FI - Blockchain, fintech and Financial Innovation Lesson 2.pdf
STUDY DESIGN details- Lt Col Maksud (21).pptx
iec ppt-1 pptx icmr ppt on rehabilitation.pptx
advance b rammar.pptxfdgdfgdfsgdfgsdgfdfgdfgsdfgdfgdfg
AI Strategy room jwfjksfksfjsjsjsjsjfsjfsj
Introduction to Firewall Analytics - Interfirewall and Transfirewall.pptx
Business Acumen Training GuidePresentation.pptx
Galatica Smart Energy Infrastructure Startup Pitch Deck
Ad

Applied Regression Analysis using R

  • 1. Introduction to Statistical Learning Chapter 3 Tarek Dib tdib03@gmail.com April 12, 2015 1 Data Manipulation setwd("/home/tarek/ISLR/dataSets") auto <- read.csv("auto.csv") # Structure of the data set str(auto) ## 'data.frame': 397 obs. of 9 variables: ## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ... ## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ... ## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ... ## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ... ## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ... ## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ... ## $ year : int 70 70 70 70 70 70 70 70 70 70 ... ## $ origin : int 1 1 1 1 1 1 1 1 1 1 ... ## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 # Remove the "name" column auto <- auto[,-9] # Remove rows where horsepower is missing auto <- auto[auto$horsepower != "?",] # Convert horsepower to numeric auto$horsepower <- as.numeric(as.character(auto$horsepower)) # Change cylinders into a factor auto$cylinders <- as.factor(auto$cylinders) # Convert year to a factor data type auto$year <- factor(auto$year) 2 Exploratory Data Analysis 1
  • 2. library(ggplot2) p.weight <- ggplot(auto, aes(weight, mpg, colour=cylinders)) p.weight + geom_point(size=2) + facet_grid(.~ cylinders) + ggtitle("Mileage vs. Weight Grouped By Cylinders") 3 4 5 6 8 10 20 30 40 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 weight mpg cylinders 3 4 5 6 8 Mileage vs. Weight Grouped By Cylinders p.HP <- ggplot(auto, aes(horsepower, mpg, colour=cylinders)) p.HP + geom_point(size=2) + facet_grid(.~ cylinders) + ggtitle("Mileage vs. Horsepower") 3 4 5 6 8 10 20 30 40 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 horsepower mpg cylinders 3 4 5 6 8 Mileage vs. Horsepower p.mpgHP <- ggplot(auto, aes(horsepower, mpg)) p.mpgHP + geom_point() + stat_smooth(method="lm", se=F) 10 20 30 40 50 100 150 200 horsepower mpg 2
  • 3. p.year <- ggplot(auto, aes(year, mpg, colour=cylinders)) p.year + geom_boxplot() + ggtitle("Mileage change over the years Grouped by Cylinders") 10 20 30 40 70 71 72 73 74 75 76 77 78 79 80 81 82 year mpg cylinders 3 4 5 6 8 Mileage change over the years Grouped by Cylinders plt1 <- ggplot(auto, aes(cylinders, mpg)) plt1 + geom_boxplot() 10 20 30 40 3 4 5 6 8 cylinders mpg 3 Simple Linear Regression, mpg vs horsepower lm.fit1 <- lm(mpg ~ horsepower, data = auto) s <- summary(lm.fit1) library(xtable) summaryTab <- xtable(s, caption = "Simple Linear Regression Model, Summary Tablelabel{tab:mpg vs. hors ,label="tab:mpg vs. horsepower") Rsquared.simple <- s$r.squared # Predict mpg for horsepower = 98, and its 95% confidence and prediction interval, respectively predict(lm.fit1, newdata=data.frame(horsepower=98), interval="confidence") ## fit lwr upr ## 1 24.46708 23.97308 24.96108 3
  • 4. predict(lm.fit1, newdata=data.frame(horsepower=98), interval="prediction") ## fit lwr upr ## 1 24.46708 14.8094 34.12476 p.mpgHP + geom_point() + stat_smooth(method="lm", se=F) + geom_abline(intercept = coef(lm.fit1)[1], slope=coef(lm.fit1)[2]) 10 20 30 40 50 100 150 200 horsepower mpg Table 1: Simple Linear Regression Model, Summary Table Estimate Std. Error t value Pr(>|t|) (Intercept) 39.9359 0.7175 55.66 0.0000 horsepower -0.1578 0.0064 -24.49 0.0000 4 Removing Cylinders 3 and 5 Since there are only 4 cars with 3 cylinders and 3 cars with 5 cylinders, I have decided to exclude cars with 3 and 5 cylinders, and focus on presenting and analyzing cars with 4, 6 and 8 cylinders. # Exclude 3 and 5 cylinder cars auto <- auto[auto$cylinders != 3 & auto$cylinders != 5,] # plot it h1 <- ggplot(auto, aes(horsepower, mpg, colour = cylinders)) h1 + geom_point() + facet_grid(. ~ cylinders) 4
  • 5. 4 6 8 10 20 30 40 50 100 150 200 50 100 150 200 50 100 150 200 horsepower mpg cylinders 4 6 8 plt3 <- ggplot(auto, aes(weight, mpg, colour = cylinders)) plt3 + geom_point() + facet_grid(. ~ cylinders) + ggtitle("MPG vs. Weight Grouped by Cylinders") + stat_smooth(method="lm", se=F) 4 6 8 10 20 30 40 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 weight mpg cylinders 4 6 8 MPG vs. Weight Grouped by Cylinders 5 Correlation Matrix # Create a dataframe with the numeric variables: mpg, displacement, horsepower, weight, acceleration df1 <- auto[,c("mpg", "displacement", "horsepower", "weight", "acceleration")] # Correlation matrix Cor <- cor(df1) CorTab <- xtable(Cor, caption = "Correlation Matrixlabel{tab:Correlation}" ,label="Correlation") 6 Multiple Linear Regression Model # Multi linear regression model lm.fit2 <- lm(mpg ~ weight + horsepower + acceleration + cylinders + cylinders:horsepower, data=auto) 5
  • 6. Table 2: Correlation Matrix mpg displacement horsepower weight acceleration mpg 1.00 -0.82 -0.78 -0.84 0.42 displacement -0.82 1.00 0.90 0.94 -0.56 horsepower -0.78 0.90 1.00 0.87 -0.69 weight -0.84 0.94 0.87 1.00 -0.43 acceleration 0.42 -0.56 -0.69 -0.43 1.00 # Estimate Variance Inflation Factor (vif) found in the car package. # If vif >= 10, then remove the predictor for multicollinearity issue library(car) vifTab <- xtable(vif(lm.fit2)) s.fit2 <- summary(lm.fit2) s.fit2.Tab <- xtable(s.fit2, caption = "Multiple Linear Regression Model, Summary Table") Rsquared.multi <- s.fit2$r.squared print(vifTab) GVIF Df GVIF^(1/(2*Df)) weight 10.14 1.00 3.18 horsepower 26.99 1.00 5.20 acceleration 2.81 1.00 1.68 cylinders 2129.64 2.00 6.79 horsepower:cylinders 3527.06 2.00 7.71 print(s.fit2.Tab, caption.placement="top") Table 3: Multiple Linear Regression Model, Summary Table Estimate Std. Error t value Pr(>|t|) (Intercept) 57.4208 2.7576 20.82 0.0000 weight -0.0027 0.0007 -3.76 0.0002 horsepower -0.2141 0.0256 -8.38 0.0000 acceleration -0.3145 0.1161 -2.71 0.0071 cylinders6 -22.2642 3.5438 -6.28 0.0000 cylinders8 -18.9358 2.9615 -6.39 0.0000 horsepower:cylinders6 0.1992 0.0356 5.60 0.0000 horsepower:cylinders8 0.1607 0.0240 6.69 0.0000 7 Model Diagnostics 6
  • 7. −10 0 10 10 20 30 fitted residuals Figure 1: Residuals vs. Fitted Values. The gure proves that the homoscedasticity assumption i.e. constant variance is violated. Thus, the multivariate linear regression model developed above seems not to be suitable. The response variable may need to be transformed and then retted. Log transfomation of the response (mpg) variable may rectify the non constant variance. −10 0 10 −2 0 2 theoretical sample Figure 2: qqnorm and qqline plots to test for the normality assumption. The data do not seem to deviate much from the normality assumption. There seems to be few outliers in the data set. 7
  • 8. 0 20 40 60 −10 0 10 20 residuals count 0.00 0.05 0.10 0.15 0.20 0.25 0 100 200 300 400 Index Leverages Index plot of Leverages 8 Modifying the model lm.fit3 - lm(log(mpg) ~ weight + horsepower + acceleration + cylinders + cylinders * horsepower, data=auto) summary(lm.fit3) ## ## Call: ## lm(formula = log(mpg) ~ weight + horsepower + acceleration + ## cylinders + cylinders * horsepower, data = auto) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.36326 -0.08544 -0.00528 0.08195 0.63418 ## 8
  • 9. ## Coefficients: ## Estimate Std. Error t value Pr(|t|) ## (Intercept) 4.421e+00 1.063e-01 41.613 2e-16 *** ## weight -1.365e-04 2.738e-05 -4.986 9.40e-07 *** ## horsepower -6.784e-03 9.846e-04 -6.890 2.35e-11 *** ## acceleration -1.308e-02 4.473e-03 -2.924 0.00366 ** ## cylinders6 -7.404e-01 1.365e-01 -5.422 1.05e-07 *** ## cylinders8 -4.742e-01 1.141e-01 -4.156 4.01e-05 *** ## horsepower:cylinders6 6.271e-03 1.371e-03 4.574 6.50e-06 *** ## horsepower:cylinders8 3.456e-03 9.263e-04 3.731 0.00022 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.1439 on 377 degrees of freedom ## Multiple R-squared: 0.8259,Adjusted R-squared: 0.8227 ## F-statistic: 255.5 on 7 and 377 DF, p-value: 2.2e-16 par(mfrow=c(2,2)) plot(lm.fit3) 2.4 2.6 2.8 3.0 3.2 3.4 3.6 −0.40.6 Fitted values Residuals Residuals vs Fitted 387 361365 −3 −2 −1 0 1 2 3 −24 Theoretical Quantiles Standardizedresiduals Normal Q−Q 387 361365 2.4 2.6 2.8 3.0 3.2 3.4 3.6 0.01.5 Fitted values Standardizedresiduals Scale−Location 387 361365 0.00 0.05 0.10 0.15 0.20 0.25 −24 Leverage Standardizedresiduals Cook's distance 0.5 0.5 1 Residuals vs Leverage 334361 387 Rsquared.mod - summary(lm.fit3)$r.squared 8.1 Residuals and QQ Norm of Modied Model par(mfrow=c(3,1)) plot(lm.fit3$fit, lm.fit3$res, xlab=Fitted Values, ylab=Residuals) hist(resid(lm.fit3)) qqnorm(resid(lm.fit3)) qqline(resid(lm.fit3)) 9
  • 10. 2.4 2.6 2.8 3.0 3.2 3.4 3.6 −0.40.00.20.40.6 Fitted Values Residuals Histogram of resid(lm.fit3) resid(lm.fit3) Frequency −0.4 −0.2 0.0 0.2 0.4 0.6 020406080 −3 −2 −1 0 1 2 3 −0.40.00.20.40.6 Normal Q−Q Plot Theoretical Quantiles SampleQuantiles 10
  • 11. 9 Comparison of Rsquared value among the Models # Dataframe of the rsquareds df2 - data.frame(rbind(Rsquared.simple, Rsquared.multi, Rsquared.mod)) names(df2) - Rsquared rownames(df2) - c(Simple, Multiple, Modified Multiple) dfTab - xtable(df2) Rsquared Simple 0.61 Multiple 0.78 Modied Multiple 0.83 11