-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSimple Linear Regression.R
69 lines (49 loc) · 2.67 KB
/
Simple Linear Regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# SLR: y = b0 + b1*x1
# y is the Dependent Variable, b0 is the Constant, b1 is the Coefficient, x1 is the Independent Variable.
# The Constant (b0) is the point where the line crosses the vertical axis.
# b1 is the Slope of the line.
# ------------------------------------------------ Importing Data -------------------------------------------- #
# Read the data set from a CSV file; the relative path is resolved against
# the current working directory. The model fitted later in this script uses
# its `YearsExperience` and `Salary` columns.
Salary_Data <- read.csv(file = "Salary_Data.csv")
# ------------------------- Splitting the Dataset into the Training set and Testing Set ---------------------- #
# install.packages("caTools") <----- Remove comment if not installed
library(caTools)

# Fix the random seed so the split below is reproducible from run to run.
set.seed(123)

# Unlike Python, where the fraction given is for the Test Set, SplitRatio in
# R is the fraction of observations assigned to the Training Set.
split <- sample.split(Salary_Data$Salary, SplitRatio = 0.8)
split

# `split` is a logical mask: TRUE sends the observation to the Training Set,
# FALSE sends it to the Test Set.
train_set <- subset(Salary_Data, subset = split)
train_set
test_set <- subset(Salary_Data, subset = !split)
test_set
# ----------------------------- Fitting Simple Linear Regression to the Training Set ------------------------- #
# Regress Salary on YearsExperience using only the training rows, then show
# the model summary (coefficients, p-values, fit statistics).
reg <- lm(formula = Salary ~ YearsExperience, data = train_set)
summary(reg)
# The most important things are the p-value and the significance level, because these tell us about the
# statistical significance of the independent variable's effect on the dependent variable.
# The lower the p-value, the more statistically significant the independent variable is.
# If the p-value is lower than 5%, the independent variable is highly statistically
# significant; the further above 5% it is, the less statistically significant it is.
# ------------------------------------------ Predicting the Test Set results ---------------------------------- #
# Apply the model fitted on the training rows to the held-out test rows and
# display the predicted salaries.
y_pred <- predict(object = reg, newdata = test_set)
y_pred
# ------------------------------------------- Visualising the Training Set ------------------------------------ #
# install.packages("ggplot2") <----- Remove comment if not installed
library(ggplot2)

# Red points: the observed training data.
# Blue line: the model's predictions for those same rows, i.e. the fitted
# regression line.
# The data frame is passed via `data =` and columns are referenced by bare
# name inside aes(), rather than with `train_set$...`.
ggplot(data = train_set, mapping = aes(x = YearsExperience, y = Salary)) +
  geom_point(colour = "red") +
  geom_line(
    mapping = aes(y = predict(reg, newdata = train_set)),
    colour = "blue"
  ) +
  ggtitle("Salary vs Experience (Training Set)") +
  xlab("Years of Experience") +
  ylab("Salary")
# --------------------------------------------- Visualising the Test Set -------------------------------------- #
# Red points: the observed test data.
# Blue line: still drawn from the training rows' predictions. The model is
# fitted only once (on the training set), so this is the same regression
# line, here overlaid on the held-out points.
ggplot() +
  geom_point(
    data = test_set,
    mapping = aes(x = YearsExperience, y = Salary),
    colour = "red"
  ) +
  geom_line(
    data = train_set,
    mapping = aes(
      x = YearsExperience,
      y = predict(reg, newdata = train_set)
    ),
    colour = "blue"
  ) +
  labs(
    title = "Salary vs Experience (Test Set)",
    x = "Years of Experience",
    y = "Salary"
  )