K-Means clustering algorithm in F#

This is an implementation of the K-Means clustering algorithm written in F#. Given a set of points and an integer K, the goal is to group the points into K clusters.

Each point is represented as a vector, which I model as a list of floats, one per coordinate. I start by defining the vector operations that the actual algorithm will need.

open System

let euclideanDistanceSquared v1 v2 = 
    List.zip v1 v2
    |> List.map (fun (x1, x2) -> 
        let diff = (x1 - x2)
        diff * diff)
    |> List.sum

let vecAdd v1 v2 =
    List.zip v1 v2
    |> List.map (fun (x1, x2) -> x1 + x2)

let vecSum vs =
    vs
    |> List.reduce vecAdd

let vecScale a v =
    v |> List.map ((*) a)

let vecAverage (vs : float list list) =
    let length = List.length vs |> float
    vecScale (1.0 / length) (vecSum vs)
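
As a quick illustration of how these helpers behave, evaluating them in F# Interactive on a couple of small two-dimensional vectors should give the results shown in the comments.

euclideanDistanceSquared [0.0; 0.0] [3.0; 4.0]   // 25.0, i.e. 3*3 + 4*4
vecAdd [1.0; 2.0] [3.0; 4.0]                     // [4.0; 6.0]
vecAverage [[1.0; 2.0]; [3.0; 4.0]]              // [2.0; 3.0]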

Next, I define a couple of helper functions that let me pick a random selection of elements from a list without replacement. This sampling will be used to choose the initial centroids.

let splitAt n lst =
    let rec loop n revAcc lst =
        match lst with
        | [] -> (List.rev revAcc, [])
        | _ when n = 0 -> (List.rev revAcc, lst)
        | x::xs -> loop (n - 1) (x::revAcc) xs
    loop n [] lst 
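
For example, splitting a small list at index 2 should give the two parts shown in the comments; note that splitting past the end of the list simply returns the whole list and an empty remainder.

splitAt 2 [1; 2; 3; 4; 5]   // ([1; 2], [3; 4; 5])
splitAt 9 [1; 2; 3]         // ([1; 2; 3], [])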

let randomSampleWithoutReplacement (rnd : Random) n lst =
    let length = List.length lst
    let rec loop lst n len acc =
        match lst, n with
            | [], _ -> acc
            | _, n when n <= 0 -> acc
            | _ ->
                let i = rnd.Next(0, len)
                let part1, part2 = splitAt i lst
                let hd = List.head part2
                let tl = List.tail part2
                let newAcc = hd::acc
                let rest = part1 @ tl
                loop rest (n-1) (len-1) newAcc
    loop lst n length []
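
As a quick sketch of its use (the sampleRnd binding here is just for illustration), drawing three elements from a list of ten returns three distinct elements of that list; which ones depends on the seed.

let sampleRnd = Random(42)
randomSampleWithoutReplacement sampleRnd 3 [1 .. 10]
// three distinct elements of [1 .. 10]; the exact values depend on the seed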

I am going to use this function to pick n random points as the initial centroids. The method above is efficient for small n, which is typically the case here, since the number of centroids is usually much smaller than the total number of points.

An alternative way of picking a random sample without replacement is to pair each point with a random number, sort by that number, and finally take the first n points.

let randomSampleWithoutReplacement' (rnd: Random) n lst =
    lst 
    |> List.map (fun x -> (rnd.Next(), x)) 
    |> List.sortBy (fun (r, x) -> r)
    |> Seq.take (max (min (List.length lst) n) 0)
    |> Seq.map (fun (r, x) -> x)
    |> Seq.toList
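
The min/max clamp means this version also behaves sensibly at the edges: asking for more elements than the list holds returns a shuffled copy of the whole list, and a non-positive n returns an empty list. A quick sketch (the sampleRnd' binding is just for illustration):

let sampleRnd' = Random(7)
randomSampleWithoutReplacement' sampleRnd' 20 [1; 2; 3]   // a permutation of [1; 2; 3]
randomSampleWithoutReplacement' sampleRnd' 0  [1; 2; 3]   // []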

The code below is the actual implementation of the K-Means algorithm. More specifically, I run the algorithm several times, with different initial centroids, and keep the best result.

let findClusters (rnd : Random) n tries points =

    let closestCentroid centroids x =
        centroids
        |> List.mapi (fun i centroid -> (i, euclideanDistanceSquared x centroid))
        |> List.minBy snd

    let assignToClusters centroids =
        let pointsWithClusterIdAndCost =
            points
            |> List.mapi (fun i p -> ((i, p), closestCentroid centroids p))
        let totalCost = 
            pointsWithClusterIdAndCost
            |> List.sumBy (fun (_, (clusterId, cost)) -> cost)
        let clusters =
            pointsWithClusterIdAndCost
            |> Seq.groupBy (fun (_, (clusterId, cost)) -> clusterId)
            |> Seq.sortBy fst
        let clusteredIds =
            clusters
            |> Seq.map (fun (clusterId, items) ->
                items
                |> Seq.map (fun ((i, p), _) -> i)
                |> Seq.toList)
            |> Seq.toList
        let clusteredPoints =
            clusters
            |> Seq.map (fun (clusterId, items) ->
                items
                |> Seq.map (fun ((i, p), _) -> p)
                |> Seq.toList)
            |> Seq.toList
         
        (totalCost, clusteredIds, clusteredPoints)
            
    let updateCentroids clusteredPoints = 
        clusteredPoints 
        |> List.map vecAverage

    let rec singleTry centroids previousClusteredIds =
        let cost, clusteredIds, clusteredPoints = assignToClusters centroids
        if (clusteredIds = previousClusteredIds)
        then
            cost, clusteredIds, clusteredPoints
        else 
            let newCentroids = updateCentroids clusteredPoints
            singleTry newCentroids clusteredIds

    let rec tryMany times bestCost bestClusteredIds bestClusteredPoints =
        if times <= 0
        then bestClusteredIds, bestClusteredPoints
        else
            let startingCentroids = randomSampleWithoutReplacement rnd n points
            let cost, clusteredIds, clusteredPoints = singleTry startingCentroids [] 
            if cost < bestCost
            then tryMany (times - 1) cost clusteredIds clusteredPoints
            else tryMany (times - 1) bestCost bestClusteredIds bestClusteredPoints

    tryMany tries Double.MaxValue [] []

The algorithm returns the indices of the input points grouped into K clusters and, separately, the points themselves grouped into K clusters. Both results are a list of K lists, one per cluster. With small modifications we could also return the centroids of the final answer.
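
One lightweight way to obtain those centroids without modifying findClusters is to recompute the mean of each returned cluster, mirroring what updateCentroids does internally; a minimal sketch:

let finalCentroids clusteredPoints =
    // each centroid is the mean of the points assigned to its cluster
    clusteredPoints |> List.map vecAverage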

I test the algorithm by creating some synthetic data. I generate 40 points clustered around 4 centroids, 10 points for each cluster.

let seed = 38970
let rnd = Random(seed)

let randomPoint center noise dimensions =
    let noiseVector =
        [1 .. dimensions]
        |> List.map (fun _ -> (2.0 * rnd.NextDouble() - 1.0) * noise)
    vecAdd center noiseVector

let data =
    [ ([-1.0; -1.0], 0.45)
      ([-1.0;  1.0], 0.40)
      ([ 1.0; -1.0], 0.35)
      ([ 1.0;  1.0], 0.50) ]
    |> List.map (fun (c, n) ->
        [1 .. 10]
        |> List.map (fun _ -> randomPoint c n 2))
    |> List.collect id

let nTries = 1
let nClusters = 4

let clusteredIds, clusteredPoints =
    data |> findClusters rnd nClusters nTries

clusteredIds
|> List.iter (fun items ->
    items
    |> List.iter (printf "%d ")
    printfn "")
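
As an extra sanity check, the size of each cluster can be printed as well; with well separated centers, each cluster should end up with roughly 10 points.

clusteredPoints
|> List.iter (fun cluster ->
    printfn "cluster size: %d" (List.length cluster))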

If I run the algorithm with nTries = 1 I can get a solution that is not optimal; the result may look like this

If I set nTries to a higher number, the algorithm tries several starting configurations and returns the best solution, so there is less chance of ending up with a suboptimal result like the one above. The solution I get for nTries = 3 is shown below.

 
