## K-Means clustering algorithm in F#

This is an implementation of the K-Means clustering algorithm written in F#. Given a set of points and an integer `K`, the goal is to group the points into `K` clusters.

Each point is represented as a vector, and I represent each vector as a list of numbers, one for each coordinate. I define some operations for these vectors that will be needed for the actual algorithm.

```fsharp
open System
/// Squared Euclidean distance between two vectors of equal length.
/// The square root is deliberately omitted: K-means only compares
/// distances, and the ordering is the same without it.
let euclideanDistanceSquared v1 v2 =
    List.map2 (fun a b -> (a - b) * (a - b)) v1 v2
    |> List.sum
/// Component-wise sum of two vectors of equal length.
let vecAdd v1 v2 = List.map2 (+) v1 v2
/// Component-wise sum of a non-empty list of equal-length vectors.
/// (List.reduce raises on an empty list, as the original did.)
let vecSum vs = List.reduce (List.map2 (+)) vs
/// Multiplies every component of vector v by the scalar a.
let vecScale a v = List.map (fun x -> a * x) v
/// Component-wise mean of a non-empty list of equal-length float vectors.
/// Raises ArgumentException for an empty list, instead of the opaque
/// List.reduce failure the original would hit inside vecSum.
let vecAverage (vs : float list list) =
    if List.isEmpty vs then invalidArg "vs" "vecAverage: cannot average an empty list of vectors"
    let length = List.length vs |> float
    vs
    |> List.reduce (List.map2 (+))            // vector sum
    |> List.map (fun x -> (1.0 / length) * x) // scale by 1/length, exactly as vecScale would
```

Next, I define a couple of helper functions, so that I can pick a random selection of elements from a list without replacement. This will be used to pick the initial centroids.

```fsharp
/// Splits a list into (first n elements, remainder).
/// Unlike List.splitAt, this tolerates n greater than the list length
/// (returning the whole list and []) and a non-positive n
/// (consuming nothing once the count check is reached).
let splitAt n lst =
    let rec go remaining taken count =
        match remaining with
        | [] -> List.rev taken, []
        | _ when count = 0 -> List.rev taken, remaining
        | x :: rest -> go rest (x :: taken) (count - 1)
    go lst [] n
let randomSampleWithoutReplacement (rnd : Random) n lst =
let length = List.length lst
let rec loop lst n len acc =
match lst, n with
| [], _ -> acc
| _, n when n <= 0 -> acc
| _ ->
let i = rnd.Next(0, len)
let part1, part2 = splitAt i lst
let hd = List.head part2
let tl = List.tail part2
let newAcc = hd::acc
let rest = part1 @ tl
loop rest (n-1) (len-1) newAcc
loop lst n length []
```

I am going to use this function to pick `n` random points as the initial centroids. The method above is efficient for small `n`, which is usually the case here, as the number of centroids is usually much smaller than the total number of points.

An alternative way of picking a random sample without replacement is zipping all the points with a random number, then ordering by the random number, and finally taking the first `n` points.

```fsharp
/// Alternative sampler: tag every element with a random key, sort by
/// the key, and keep the first few. The requested count is clamped
/// to [0, length], so no exception is possible for out-of-range n.
let randomSampleWithoutReplacement' (rnd: Random) n lst =
    let keyed = lst |> List.map (fun item -> rnd.Next(), item)
    let count = max (min (List.length lst) n) 0
    keyed
    |> List.sortBy fst
    |> List.truncate count
    |> List.map snd
```

The code below is the actual implementation of the `K-Means` algorithm; more specifically, I perform the algorithm several times, with different initial clusters, and keep the best result.

```fsharp
/// Runs K-means `tries` times over `points` with `n` randomly drawn
/// initial centroids, and returns the clustering with the lowest
/// total cost as (clustered indices, clustered points) — two parallel
/// lists of up to n lists, one per cluster.
let findClusters (rnd : Random) n tries points =
    // Index of the centroid nearest to x, paired with the squared
    // distance to it (the per-point contribution to the total cost).
    let closestCendroid centroids x =
        centroids
        |> List.mapi (fun i centroid -> (i, euclideanDistanceSquared x centroid))
        |> List.minBy snd
    // One assignment pass: map every point to its nearest centroid and
    // return (total cost, point indices per cluster, points per cluster).
    let assignToClusters centroids =
        // ((original index, point), (cluster id, cost)) for every point.
        let pointsWithClusterIdAndCost =
            points
            |> List.mapi (fun i p -> ((i, p), closestCendroid centroids p))
        let totalCost =
            pointsWithClusterIdAndCost
            |> List.sumBy (fun (_, (clusterId, cost)) -> cost)
        // NOTE(review): a centroid that attracts no points produces no
        // group here, so the cluster count can silently shrink below n
        // on degenerate draws — confirm whether that is acceptable.
        let clusters =
            pointsWithClusterIdAndCost
            |> Seq.groupBy (fun (_, (clusterId, cost)) -> clusterId)
            |> Seq.sortBy fst
        let clusteredIds =
            clusters
            |> Seq.map (fun (clusterId, items) ->
                items
                |> Seq.map (fun ((i, p), _) -> i)
                |> Seq.toList)
            |> Seq.toList
        let clusteredPoints =
            clusters
            |> Seq.map (fun (clusterId, items) ->
                items
                |> Seq.map (fun ((i, p), _) -> p)
                |> Seq.toList)
            |> Seq.toList
        (totalCost, clusteredIds, clusteredPoints)
    // Each new centroid is the mean of the points in its cluster.
    let updateCentroids clusteredPoints =
        clusteredPoints
        |> List.map vecAverage
    // Iterate assign/update until the assignment stops changing
    // (compared structurally against the previous iteration's ids).
    let rec singleTry centroids previousClusteredIds =
        let cost, clusteredIds, clusteredPoints = assignToClusters centroids
        if (clusteredIds = previousClusteredIds)
        then
            cost, clusteredIds, clusteredPoints
        else
            let newCentroids = updateCentroids clusteredPoints
            singleTry newCentroids clusteredIds
    // Restart `times` times from fresh random centroids, keeping the
    // lowest-cost result seen so far.
    let rec tryMany times bestCost bestClusteredIds bestClusteredPoints =
        if times <= 0
        then bestClusteredIds, bestClusteredPoints
        else
            let startingCentroids = randomSampleWithoutReplacement rnd n points
            let cost, clusteredIds, clusteredPoints = singleTry startingCentroids []
            if cost < bestCost
            then tryMany (times - 1) cost clusteredIds clusteredPoints
            else tryMany (times - 1) bestCost bestClusteredIds bestClusteredPoints
    // Double.MaxValue guarantees the first try always becomes the best;
    // with tries <= 0 the result is ([], []).
    tryMany tries Double.MaxValue [] []
```

The algorithm returns the indices of the points passed in, clustered into `K` clusters, and separately the points themselves, clustered into `K` clusters. Both of them are a list of `K` lists, one for each cluster. With small modifications we could also return the centroids for the final answer.

I test the algorithm by creating some synthetic data. I generate 40 points clustered around 4 centroids, 10 points for each cluster.

```fsharp
// Fixed seed so both the synthetic data and the clustering runs
// are reproducible.
let seed = 38970
let rnd = Random(seed)

// A point near `center`, jittered on each of `dimensions` coordinates
// by uniform noise drawn from [-noise, noise].
let randomPoint center noise dimensions =
    let noiseVector =
        [1 .. dimensions]
        |> List.map (fun _ -> (2.0 * rnd.NextDouble() - 1.0) * noise)
    vecAdd center noiseVector

// 40 synthetic 2-D points: 10 around each of 4 centers, each center
// with its own noise amplitude.
let data =
    [ ([-1.0; -1.0], 0.45)
      ([-1.0; 1.0], 0.40)
      ([ 1.0; -1.0], 0.35)
      ([ 1.0; 1.0], 0.50) ]
    |> List.map (fun (c, n) ->
        [1 .. 10]
        |> List.map (fun _ -> randomPoint c n 2))
    |> List.collect id

let nTries = 1
let nClusters = 4

let clusteredIds, clusteredPoints =
    data |> findClusters rnd nClusters nTries

// Print each cluster's point indices on its own line.
clusteredIds
|> List.iter(fun items ->
    items
    |> List.iter (printf "%d ")
    printfn "")
```

If I run the algorithm with `nTries = 1` I can get a solution that is not optimal, and it is possible that the result looks like this:

If I set `nTries` to a higher number, the algorithm will try several times, and give back the best solution. This way there are fewer chances of getting a solution like the one above. The solution I get for `nTries = 3` is shown below.