# Upgrades pip itself to the latest release; this command does not install the
# packages imported below (pandas, scikit-learn, matplotlib).
$ python -m pip install -U pip

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Read the salary CSV (path is relative to the working directory) into a DataFrame.
dataset = pd.read_csv('05_1_0_salary_data.csv')

# Preview the leading rows. As a bare expression the result is only rendered
# when this is the last statement of an interactive/notebook cell.
dataset.head()

# Report the frame's dimensions, per-column dtypes, and per-column null counts.
for label, summary in (
    ("Shape of dataset:", dataset.shape),
    ("Data types of columns:\n", dataset.dtypes),
    ("Missing values in dataset:\n", dataset.isnull().sum()),
):
    print(label, summary)

Shape of dataset: (30, 2)
Data types of columns:
 YearsExperience    float64
Salary             float64
dtype: object
Missing values in dataset:
 YearsExperience    0
Salary             0
dtype: int64

# Feature matrix: the double brackets keep YearsExperience as a one-column
# DataFrame (2-D), which is the layout the estimator is given as X.
x = dataset[['YearsExperience']]
# Target vector: single brackets select Salary as a 1-D Series.
y = dataset['Salary']

# Hold out a fraction of the rows for testing. test_size=0.2 is the share of
# samples placed in the test split (the remaining 80% form the training
# split); random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Report the dimensions of each resulting split.
for label, part in (
    ("Shape of training dataset:", X_train),
    ("Shape of test dataset:", X_test),
    ("Shape of training labels:", y_train),
    ("Shape of test labels:", y_test),
):
    print(label, part.shape)

Shape of training dataset: (24, 1)
Shape of test dataset: (6, 1)
Shape of training labels: (24,)
Shape of test labels: (6,)

# Build a simple (single-feature) linear regression estimator and fit it.
# Only the training split is used for learning; the test split stays unseen
# until evaluation.
# NOTE(review): scikit-learn documents LinearRegression as an ordinary least
# squares fit, so training is presumably a direct solve rather than an
# iterative "initialise weights / compute loss / update weights" loop --
# confirm against the library documentation.
model = LinearRegression().fit(X_train, y_train)

# Learned parameters of y = w_1 * X + w_0. coef_ is an array with one slope
# per feature (hence the brackets in the printed text); intercept_ is the bias.
w_1, w_0 = model.coef_, model.intercept_
print(
    f'coefficent: {w_1}',
    f'w_0: {w_0}',
    f'y = w_1 * X + w_0 -> {w_1}X + {w_0}',
    sep='\n',
)

coefficent: [9312.57512673]
w_0: 26780.09915062818
y = w_1 * X + w_0 -> [9312.57512673]X + 26780.09915062818

# Score the fitted model on the held-out test split.
# predict() returns the model's salary estimates for the given feature rows;
# mean_squared_error averages the squared differences between those estimates
# and the true salaries, so the value is in squared salary units.
y_pred = model.predict(X_test)
result = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(result)

12823412.298126549

# Draw one figure per split: the observed points as a scatter plot and the
# fitted regression line on top of them.
#   - training set: red points, blue line
#   - test set:     blue points, red line
for features, targets, point_color, line_color, heading in (
    (X_train, y_train, 'red', 'blue', 'Salary vs Experience (Train set)'),
    (X_test, y_test, 'blue', 'red', 'Salary vs Experience (Test set)'),
):
    plt.scatter(features, targets, color=point_color)
    # The line is the model's prediction for the same feature values.
    plt.plot(features, model.predict(features), color=line_color)
    plt.title(heading)
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.show()

	YearsExperience	Salary
0	1.1	39343.0
1	1.3	46205.0
2	1.5	37731.0
3	2.0	43525.0
4	2.2	39891.0

Simple Linear Regression¶

Practice Workflow¶

Install dependencies¶

Import required libraries and packages

Load Dataset

2. Split the dataset into training and test sets

Build the Model

Train the Model¶

Evaluate Model Performance¶

Visualization of Model Behavior¶