-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathch_02.R
180 lines (127 loc) · 4.79 KB
/
ch_02.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
library(ggplot2)
library(broom)
unemployment <- readRDS('unemployment.rds')
fmla <- as.formula("female_unemployment ~ male_unemployment")
unemployment_model <- lm(formula = fmla, data = unemployment)
# unemployment, unemployment_model are in the workspace
summary(unemployment)
summary(unemployment_model)
# Make predictions from the model
unemployment$predictions <- predict(unemployment_model)
# Fill in the blanks to plot predictions (on x-axis) versus the female_unemployment rates
ggplot(unemployment, aes(x = predictions, y = female_unemployment)) +
geom_point() +
geom_abline()
# unemployment is in the workspace (with predictions)
summary(unemployment)
# unemployment_model is in the workspace
summary(unemployment_model)
# Load the package WVPlots
library(WVPlots)
# Plot the Gain Curve
GainCurvePlot(unemployment, "predictions", "female_unemployment", "Unemployment model")
# unemployment is in the workspace
summary(unemployment)
# For convenience put the residuals in the variable res
res <- unemployment$residuals
# Calculate RMSE, assign it to the variable rmse and print it
(rmse <- sqrt(mean(res ** 2)))
# Calculate the standard deviation of female_unemployment and print it
(sd_unemployment <- sd(unemployment$female_unemployment))
# unemployment is in your workspace
summary(unemployment)
# unemployment_model is in the workspace
summary(unemployment_model)
# Calculate mean female_unemployment: fe_mean. Print it
(fe_mean <- mean(unemployment$female_unemployment))
print(fe_mean)
# Calculate total sum of squares: tss. Print it
(tss <- sum((unemployment$female_unemployment - fe_mean) ^ 2))
# Calculate residual sum of squares: rss. Print it
(rss <- sum(unemployment$residuals ** 2))
# Calculate R-squared: rsq. Print it. Is it a good fit?
(rsq <- 1 - rss / tss)
# Get R-squared from glance. Print it
(rsq_glance <- glance(unemployment_model)$r.squared)
# unemployment is in your workspace
summary(unemployment)
# unemployment_model is in the workspace
summary(unemployment_model)
# Get the correlation between the prediction and true outcome: rho and print it
(rho <- cor(unemployment$female_unemployment, unemployment$predictions))
# Square rho: rho2 and print it
(rho2 <- rho ** 2)
# Get R-squared from glance and print it
(rsq_glance <- glance(unemployment_model)$r.squared)
# mpg is in the workspace
summary(mpg)
dim(mpg)
# Use nrow to get the number of rows in mpg (N) and print it
(N <- nrow(mpg))
# Calculate how many rows 75% of N should be and print it
# Hint: use round() to get an integer
(target <- round(N * .75))
# Create the vector of N uniform random variables: gp
gp <- runif(N)
# Use gp to create the training set: mpg_train (75% of data) and mpg_test (25% of data)
mpg_train <- mpg[gp < .75,]
mpg_test <- mpg[gp >= .75,]
# Use nrow() to examine mpg_train and mpg_test
nrow(mpg_train)
nrow(mpg_test)
# mpg_train is in the workspace
summary(mpg_train)
# Create a formula to express cty as a function of hwy: fmla and print it.
(fmla <- as.formula("cty ~ hwy"))
# Now use lm() to build a model mpg_model from mpg_train that predicts cty from hwy
mpg_model <- lm(fmla, mpg_train)
# Use summary() to examine the model
summary(mpg_model)
# Examine the objects in the workspace
ls.str()
# predict cty from hwy for the training set
mpg_train$pred <- predict(mpg_model, mpg_train)
# predict cty from hwy for the test set
mpg_test$pred <- predict(mpg_model, mpg_test)
# Evaluate the rmse on both training and test data and print them
(rmse_train <- rmse(mpg_train$pred, mpg_train$cty))
(rmse_test <- rmse(mpg_test$pred, mpg_test$cty))
# Evaluate the r-squared on both training and test data.and print them
(rsq_train <- r_squared(mpg_train$pred, mpg_train$cty))
(rsq_test <- r_squared(mpg_test$pred, mpg_test$cty))
# Plot the predictions (on the x-axis) against the outcome (cty) on the test data
ggplot(mpg_test, aes(x = pred, y = cty)) +
geom_point() +
geom_abline()
# Load the package vtreat
library(vtreat)
rmse <- function(predcol, ycol) {
res = predcol - ycol
sqrt(mean(res ^ 2))
}
# mpg is in the workspace
summary(mpg)
# Get the number of rows in mpg
nRows <- nrow(mpg)
# Implement the 3-fold cross-fold plan with vtreat
splitPlan <- kWayCrossValidation(nRows, 3, NULL, NULL)
# Examine the split plan
str(splitPlan)
# mpg is in the workspace
summary(mpg)
# splitPlan is in the workspace
str(splitPlan)
# Run the 3-fold cross validation plan from splitPlan
k <- 3 # Number of folds
mpg$pred.cv <- 0
for (i in 1:k) {
split <- splitPlan[[i]]
model <- lm(formula = cty ~ hwy, data = mpg[split$train,])
mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app,])
}
# Predict from a full model
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))
# Get the rmse of the full model's predictions
rmse(mpg$pred, mpg$cty)
# Get the rmse of the cross-validation predictions
rmse(mpg$pred.cv, mpg$cty)