Linear Regression Implemented in Python

为什么用Python?

“说起科学计算,首先会被提到的可能就是强大的MATLAB。然而除了MATLAB的一些专业性很强的工具箱目前还无法替代外,MATLAB的大部分常用功能都可以在Python世界中找到相应的扩展库。和MATLAB相比,用Python做科学计算有如下优点:免费、更易学、丰富扩展库。”

摘自《Python科学计算》

在这篇文章中,我利用了Python的NumPy库,帮助我更快地实现线性回归:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np

'''
Implement Linear-Regression, using Python
'''


def loadDataSet():
    '''
    Read the training set from ``ex1data1.txt`` in the working directory.

    Every non-blank line must hold two comma-separated numbers: the
    feature value followed by the target value.  A constant 1.0 is put in
    front of each feature so that theta[0] acts as the intercept term.

    Return:
        x: list, [[1.0, x1(0)], [1.0, x1(1)], ... [1.0, x1(m)]]
        y: list, [y(0), y(1), ... y(m)]
    Raises:
        IOError/OSError if the file cannot be opened, ValueError or
        IndexError if a non-blank line is malformed.
    '''
    x = []
    y = []
    # The context manager closes the file even when a line fails to parse.
    with open('ex1data1.txt') as dataFile:
        for line in dataFile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. extra newlines at end of file).
                continue
            lineData = line.split(',')
            x.append([1.0, float(lineData[0])])
            y.append(float(lineData[1]))
    return (x, y)


def h(theta, x):
    '''
    Evaluate the linear hypothesis for a single sample.
        theta: 1-D ndarray holding the n model parameters
        x: 1-D ndarray holding the n feature values of the sample
    Return: the inner product of theta and x (a scalar)
    '''
    prediction = theta.dot(x)
    return prediction


def batch_gradient_descent(alpha, theta, x, y):
    '''
    Perform one batch-gradient-descent update for linear regression.

    All n parameters are updated simultaneously from the residuals of the
    current theta over the whole training set:

        newTheta = theta - alpha / m * X^T (X theta - y)

        alpha: Learning Rate
        theta: current model parameters, n values (usually all zeros at
               the start of training)
        x: m*n samples, [[x0(0), x1(0)], [x0(1), x1(1)] ... [x0(m), x1(m)]];
           a nested list or a 2-D ndarray
        y: m targets, [y(0), y(1), ... y(m)]; a list or a 1-D ndarray
    Return:
        newTheta: updated model parameters, a new 1-D float ndarray of
                  n elements (the theta argument is not modified)
    '''
    # Builtin float is used as dtype: the np.float alias no longer exists
    # in NumPy >= 1.24.  asarray also lets callers pass plain lists.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)
    m = x.shape[0]
    # Prediction error of every sample under the current parameters.
    residuals = x.dot(theta) - y
    # Column j of x weighted by the residuals is the partial derivative
    # of the cost with respect to theta[j].
    gradient = x.T.dot(residuals) / m
    return theta - alpha * gradient


def normal_equation(x, y):
    '''
    Fit linear regression in closed form with the normal equation.

    Solves (X^T X) theta = X^T y for theta.  The linear system is solved
    directly rather than multiplying by an explicit inverse, which is
    cheaper and numerically more stable.

        x: m*n samples, a nested list or a 2-D ndarray
        y: m targets, a list or a 1-D ndarray
    Return:
        theta: model parameters, 1-D ndarray of n elements
    Raises:
        numpy.linalg.LinAlgError if X^T X is singular.
    '''
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    xTransposed = x.T
    return np.linalg.solve(xTransposed.dot(x), xTransposed.dot(y))


def cost_function(theta, x, y):
    """
    Halved mean squared error of the model over the training set.
        theta: model parameters, 1-D ndarray of n elements
        x: 2-D ndarray of shape m*n, one sample per row
        y: 1-D ndarray of m target values
    Return: sum of squared residuals divided by 2*m
    """
    sampleCount = x.shape[0]
    # x.dot(theta) is a 1-D array of m predictions, one per sample.
    residuals = x.dot(theta) - y
    squaredError = residuals.dot(residuals)
    return squaredError / (2 * sampleCount)


def test():
    '''
    Test Function

    Loads the data set, runs 1000 batch-gradient-descent steps with a
    learning rate of 0.01 starting from an all-zero theta, and prints the
    cost recorded before every step, the trained theta and the hypothesis
    for the sample [1.0, 5.4994].  Then fits theta with the normal
    equation and prints the same two values for comparison.
    '''
    x, y = loadDataSet()
    x = np.array(x)
    y = np.array(y)
    n = x.shape[1]
    # Builtin float is used as dtype: the np.float alias no longer exists
    # in NumPy >= 1.24.
    theta = np.zeros(n, dtype=float)
    sample = np.array([1.0, 5.4994])

    costs = []
    for _ in range(1000):
        # Record the cost before the update, so costs[0] is the cost of
        # the initial all-zero theta.
        costs.append(cost_function(theta, x, y))
        theta = batch_gradient_descent(0.01, theta, x, y)

    # Each print call takes one pre-formatted string, which behaves the
    # same on Python 2 and Python 3; the format strings keep the spacing
    # that the former comma-separated print statements produced.
    print('Batch-Gradient-Descent: \ncost:\n%s' % (costs,))
    print('theta:  %s' % (theta,))
    print('Hypothesis:  %s \n' % (h(theta, sample),))

    print('Normal-Equation:')
    theta = normal_equation(x, y)
    print('theta:  %s' % (theta,))
    print('Hypothesis:  %s' % (h(theta, sample),))


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    test()

参考:

Kai Su /
Published under (CC) BY-NC-SA in categories Programming  tagged with python