• 周四. 8月 11th, 2022

5G编程聚合网

5G时代下一个聚合的编程学习网

热门标签

线性回归算法Sklearn完整复现

admin

11月 28, 2021

1. 模型优化

1.1 多项式与线性回归

若线性回归模型太简单导致欠拟合时,我们可以增加特征多项式来让线性回归模型更好地拟合数据。比如有两个特征x1,x2,可以增加两特征的乘积作为新特征x3。还可以增加x1^2作为另一个新特征x4

scikit-learn里,线性回归是由类sklearn.linear_model.LinearRegression实现,多项式由类sklearn.preprocessing.PolynomialFeatures实现。添加多项式特征需要一个管道把两个类串起来,要使用sklearn.pipeline.Pipeline。

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def polynomial_model(degree=1):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline first expands the inputs into polynomial terms up to
    ``degree`` (without the constant bias column) and then fits an ordinary
    linear regression on the expanded features.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` with the steps ``"polynomial_features"`` and
        ``"linear_regression"``, in that order.
    """
    steps = [
        ("polynomial_features",
         PolynomialFeatures(degree=degree, include_bias=False)),
        ("linear_regression", LinearRegression()),
    ]
    return Pipeline(steps)

fit、fit_transform、transform的区别详解:https://blog.csdn.net/weixin_38278334/article/details/82971752

scikit-learn里,使用LinearRegression进行线性回归时,可以指定normalize = True来对数据进行归一化处理。注意:normalize参数自scikit-learn 1.0起已弃用,并在1.2版本中移除;新版本中应在Pipeline里使用StandardScaler对数据进行缩放。

2. 示例:使用线性回归算法拟合正弦函数

# Sample 200 points of sin(x) on [-2*pi, 2*pi] and perturb each one with
# uniform random noise drawn from [-0.1, 0.1).
import numpy as np
n_dots = 200

X = np.linspace(-2 * np.pi, 2 * np.pi, n_dots)
Y = (np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1)[:, np.newaxis]
# Both arrays become single-column 2-D arrays of shape (n_dots, 1).
X = X[:, np.newaxis]
# Fit the data set with polynomials of degree 2, 3, 5 and 10.
from sklearn.metrics import mean_squared_error

degrees = [2, 3, 5, 10]
results = []
for d in degrees:
    # Pipeline.fit returns the fitted pipeline, so build and fit are chained.
    model = polynomial_model(degree=d).fit(X, Y)
    train_score = model.score(X, Y)
    mse = mean_squared_error(Y, model.predict(X))
    results.append(dict(model=model, degree=d, score=train_score, mse=mse))

# Report the training score and mean squared error of every fitted model.
for r in results:
    print("degree: {};train score: {};mean squared error: {}".format(
        *(r[key] for key in ("degree", "score", "mse"))))
degree: 2;train score: 0.14691964884268827;mean squared error: 0.4337561603823593
degree: 3;train score: 0.2725519790368923;mean squared error: 0.3698773040811927
degree: 5;train score: 0.8949982058380093;mean squared error: 0.053389079946778877
degree: 10;train score: 0.9936659355081904;mean squared error: 0.0032206104499468945
# Notebook-style bare expression: display the list of per-degree result dicts.
results
[{'model': Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                  ('linear_regression', LinearRegression())]),
  'degree': 2,
  'score': 0.14691964884268827,
  'mse': 0.4337561603823593},
 {'model': Pipeline(steps=[('polynomial_features',
                   PolynomialFeatures(degree=3, include_bias=False)),
                  ('linear_regression', LinearRegression())]),
  'degree': 3,
  'score': 0.2725519790368923,
  'mse': 0.3698773040811927},
 {'model': Pipeline(steps=[('polynomial_features',
                   PolynomialFeatures(degree=5, include_bias=False)),
                  ('linear_regression', LinearRegression())]),
  'degree': 5,
  'score': 0.8949982058380093,
  'mse': 0.053389079946778877},
 {'model': Pipeline(steps=[('polynomial_features',
                   PolynomialFeatures(degree=10, include_bias=False)),
                  ('linear_regression', LinearRegression())]),
  'degree': 10,
  'score': 0.9936659355081904,
  'mse': 0.0032206104499468945}]

使用mean_squared_error算出均方误差,即实际的点和模型预测的点之间距离的平方的平均值,均方误差越小说明模型拟合效果越好

# Plot how closely each fitted model follows the noisy samples.
from matplotlib.figure import SubplotParams
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6), dpi=200, subplotpars=SubplotParams(hspace=0.3))
for i, r in enumerate(results):
    # One panel per polynomial degree, laid out on a 2x2 grid.
    ax = plt.subplot(2, 2, i + 1)
    ax.set_xlim(-8, 8)
    ax.set_title("LinearRegression degree={}".format(r['degree']))
    # Blue dots are the samples; the red line is the model's prediction.
    ax.scatter(X, Y, s=5, c='b', alpha=0.5)
    ax.plot(X, r['model'].predict(X), 'r-')


3. 示例:测算房价

使用scikit-learn自带的波士顿房价数据集来训练模型,然后用模型来测算房价。

数据集收集的13个特征:

  • CRIM:城镇人均犯罪率。
  • ZN:城镇超过25,000平方英尺的住宅区域的占地比例。
  • INDUS:城镇非零售用地占地比例。
  • CHAS:是否靠近河边,1为靠近,0为远离。
  • NOX:一氧化氮浓度。
  • RM:每套房产的平均房间个数。
  • AGE:在1940年之前就盖好,且业主自住的房子的比例。
  • DIS:到波士顿五个就业中心的加权距离。
  • RAD:周边高速公路的便利性指数。
  • TAX:每10,000美元的财产税率。
  • PTRATIO:城镇的学生与教师比例。
  • B:1000(Bk - 0.63)^2,其中Bk为城镇黑人的比例。
  • LSTAT:地位较低的人口比例。
# Load the Boston housing data set.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import only works on older releases; confirm the installed version.
from sklearn.datasets import load_boston

boston = load_boston()
# Feature matrix and target vector taken from the loaded data set.
X, y = boston.data, boston.target
X.shape
(506, 13)
# Notebook-style bare expression: display the first sample's feature values.
X[0]
array([6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
       6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
       4.980e+00])
# Inspect the feature labels.
boston.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

3.1 模型训练

# Split the data set in two: 80% for training and 20% held out for testing,
# with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2)
# Train the model and measure its score on both the training and test splits.
import time
from sklearn.linear_model import LinearRegression

model = LinearRegression()

start = time.perf_counter()
model.fit(X_train,y_train)
# Score the training split here. Without this line the print below reuses a
# stale train_score left over from the earlier sine-fitting loop.
train_score = model.score(X_train,y_train)
cv_score = model.score(X_test,y_test)
print('elaspe: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
    time.perf_counter() - start,train_score,cv_score))
elaspe: 0.001908;train_score: 0.993666;cv_score: 0.778921

3.2 模型优化

# Scale the features before the regression.
# LinearRegression(normalize=True) was deprecated in scikit-learn 1.0 and
# removed in 1.2; an explicit StandardScaler step is the documented replacement
# and also works on older releases.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

数据归一化处理只会加快算法收敛速度,优化算法训练效率,无法提升算法的准确性。

#增加多项式特征,增加模型的复杂度
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def polynomial_model(degree = 1):
    """Build an unfitted polynomial-regression pipeline with feature scaling.

    The inputs are expanded into polynomial terms up to ``degree`` (without
    the constant bias column), the expanded features are scaled, and an
    ordinary linear regression is fitted on the result.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` with the steps ``"polynomial_features"``, ``"scaler"``
        and ``"linear_regression"``, in that order.
    """
    # Imported locally because the file's import lines do not bring
    # StandardScaler into scope.
    from sklearn.preprocessing import StandardScaler

    polynomial_features = PolynomialFeatures(degree = degree,
                                             include_bias = False)
    # LinearRegression(normalize=True) raises TypeError on scikit-learn >= 1.2
    # (the parameter was removed), so scaling is an explicit pipeline step.
    # with_mean=False follows the replacement suggested by the deprecation
    # notice; LinearRegression centres the data itself when fitting the intercept.
    scaler = StandardScaler(with_mean = False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features",polynomial_features),
                         ("scaler",scaler),
                         ("linear_regression",linear_regression)])
    return pipeline
# Fit the data with a second-degree polynomial model.
model = polynomial_model(degree=2)

start = time.perf_counter()
model.fit(X_train, y_train)

# Score on the training split and on the held-out split.
train_score, cv_score = (model.score(X_train, y_train),
                         model.score(X_test, y_test))
elapsed = time.perf_counter() - start
print('elaspe: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
    elapsed, train_score, cv_score))
elaspe: 0.034632;train_score: 0.929593;cv_score: 0.896364
# Fit the data with a third-degree polynomial model.
model = polynomial_model(degree=3)

start = time.perf_counter()
model.fit(X_train, y_train)

# Score on the training split and on the held-out split.
train_score, cv_score = (model.score(X_train, y_train),
                         model.score(X_test, y_test))
elapsed = time.perf_counter() - start
print('elaspe: {0:.6f};train_score: {1:0.6f};cv_score: {2:.6f}'.format(
    elapsed, train_score, cv_score))
elaspe: 0.161353;train_score: 1.000000;cv_score: -318.549144

三阶多项式出现了过拟合现象

总共有13个输入特征,从一阶变成二阶多项式输入特征个数增加了几个?(答:不含偏置项时,二阶多项式共有13 + 13 + C(13,2) = 104个特征,比一阶多了91个。)

3.3 学习曲线

# plot_learning_curve comes from the project's own common.utils module;
# presumably it draws the learning curve for one estimator on the current
# subplot (verify against that module).
from common.utils import plot_learning_curve
from sklearn.model_selection import ShuffleSplit

# Ten random 80/20 shuffles, seeded so the splits are reproducible.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0})'
degrees = [1, 2, 3]

# time.clock() was removed in Python 3.8; time.perf_counter() is the timer
# already used by the other timing code in this file.
start = time.perf_counter()
# Create exactly one figure for the three subplots; a second plt.figure()
# call would only leave an empty extra figure behind.
plt.figure(figsize=(18, 4), dpi=200)
for i, degree in enumerate(degrees):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(plt, polynomial_model(degree), title.format(degree), X, y, ylim=(0.01, 1.01), cv=cv)

print('elaspe: {0:.6f}'.format(time.perf_counter() - start))

发表评论

您的电子邮箱地址不会被公开。