I received requirements from a customer that said something about Least-Squares Regression something or another. Having not looked at anything but the most elementary statistics since around 2001, I realized that I needed to refresh myself.
I did some background reading and played with LINEST in Excel (that is a whole ball of insanity : http://support.microsoft.com/kb/828533 ). Tonight I decided to implement something in C#. Even if I don’t use it for this particular client, I will have a better understanding of what I am doing.
First, I asked my wife for help. As a biologist and geneticist she is more trained in statistics than I am. She didn’t know off the top of her head, but she pulled out her book which I then dove into.
I had the crazy idea of testing my implementation using the examples straight out of Introduction to the Practice of Statistics.
I started writing Test First.
[TestFixture]
public class LeastSquaresTests
{
readonly double[] nonExerciseActivityIncrease = { -94, -57, -29, 135, 143, 151, 245,
355, 392, 473, 486, 535, 571, 580, 620, 690 };
readonly double[] fatGain = { 4.2, 3.0, 3.7, 2.7, 3.2, 3.6, 2.4,
1.3, 3.8, 1.7, 1.6, 2.2, 1.0, 0.4, 2.3, 1.1 };
private IEnumerable<Point> getPoints()
{
return from i in Enumerable.Range(0, nonExerciseActivityIncrease.Length )
select new Point(nonExerciseActivityIncrease[i], fatGain[i]);
}
[Test]
public void XYTest()
{
var result = getPoints().GetLeastSquares();
Assert.AreEqual(324.8, nonExerciseActivityIncrease.Average(), 0.1);
Assert.AreEqual(324.8, result.Xmean, 0.1);
Assert.AreEqual(257.66, result.XstdDev, 0.01);
Assert.AreEqual(2.388, result.Ymean, 0.001);
Assert.AreEqual(1.1389, result.YstdDev, 0.0001);
Assert.AreEqual(-0.7786, result.Correlation, 0.0001);
Assert.AreEqual(-0.00344, result.Slope, 0.00001);
Assert.AreEqual(3.505, result.Intercept, 0.001);
}
}
My code wouldn’t compile. This is what Test First is all about. Then I implemented things.
public static class LeastSquares
{
public static LeastSquaresResponse GetLeastSquares(this IEnumerable<System.Windows.Point> points)
{
var xMean = (from p in points select p.X).Average();
var yMean = (from p in points select p.Y).Average();
var xVariance = (from p in points select Math.Pow(p.X – xMean, 2.0)).Sum() / (points.Count() – 1);
var yVariance = (from p in points select Math.Pow(p.Y – yMean, 2.0)).Sum() / (points.Count() – 1);
var xStdDev = Math.Sqrt(xVariance);
var yStdDev = Math.Sqrt(yVariance);
var correlation = (from p in points select ((p.X – xMean) / xStdDev) * ((p.Y – yMean) / yStdDev)).Sum() / (points.Count() – 1);
var slope = correlation * yStdDev / xStdDev;
var intercept = yMean – slope * xMean;
return new LeastSquaresResponse(xMean,
yMean,
xVariance,
yVariance,
xStdDev,
yStdDev,
correlation,
slope,
intercept);
}
}
I find the LINQ syntax to be very readable for these types of math operations. I definitely like aggregation better than using for-loops.
I declared a type LeastSquaresResponse which returns everything that is calculated when Least-Squares is computed. This is really just for testing sake. I wanted to be able to test all the values.
For completeness, I’ve incluced the definition of LeastSquaresResponse. Maybe next year I will do Analysis of Variance (ANOVA), but probably not.
public class LeastSquaresResponse
{
private readonly double xmean;
public double Xmean
{
get { return xmean; }
}
private readonly double ymean;
public double Ymean
{
get { return ymean; }
}
private readonly double xvariance;
public double Xvariance
{
get { return xvariance; }
}
private readonly double yvariance;
public double Yvariance
{
get { return yvariance; }
}
private readonly double xstdDev;
public double XstdDev
{
get { return xstdDev; }
}
private readonly double ystdDev;
public double YstdDev
{
get { return ystdDev; }
}
private readonly double correlation;
public double Correlation
{
get { return correlation; }
}
private readonly double slope;
public double Slope
{
get { return slope; }
}
private readonly double intercept;
public double Intercept
{
get { return intercept; }
}
/// <summary>
/// Initializes a new instance of the LeastSquaresResponse class.
/// </summary>
/// <param name=”xmean”></param>
/// <param name=”ymean”></param>
/// <param name=”xvariance”></param>
/// <param name=”yvariance”></param>
/// <param name=”xstdDev”></param>
/// <param name=”ystdDev”></param>
/// <param name=”correlation”></param>
/// <param name=”slope”></param>
/// <param name=”intercept”></param>
public LeastSquaresResponse(double xmean, double ymean, double xvariance, double yvariance, double xstdDev, double ystdDev, double correlation, double slope, double intercept)
{
this.xmean = xmean;
this.ymean = ymean;
this.xvariance = xvariance;
this.yvariance = yvariance;
this.xstdDev = xstdDev;
this.ystdDev = ystdDev;
this.correlation = correlation;
this.slope = slope;
this.intercept = intercept;
}
}