From 2d2616ddcea212681ca16440e90eb6a037cacb73 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 13 Dec 2021 03:14:16 +0300 Subject: [PATCH] HNSW Add AutomaticGraphClusterer --- TestHNSW/HNSWDemo/Program.cs | 33 ++++++++-- .../Services/AutomaticGraphClusterer.cs | 64 +++++++++++++++++++ .../Services/CompactBiDirectionalLinksSet.cs | 2 + ZeroLevel.HNSW/Services/Layer.cs | 1 + ZeroLevel.HNSW/SmallWorld.cs | 1 + 5 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 ZeroLevel.HNSW/Services/AutomaticGraphClusterer.cs diff --git a/TestHNSW/HNSWDemo/Program.cs b/TestHNSW/HNSWDemo/Program.cs index 57334a3..a122c12 100644 --- a/TestHNSW/HNSWDemo/Program.cs +++ b/TestHNSW/HNSWDemo/Program.cs @@ -5,6 +5,7 @@ using System.Drawing; using System.IO; using System.Linq; using ZeroLevel.HNSW; +using ZeroLevel.HNSW.Services; namespace HNSWDemo { @@ -98,18 +99,42 @@ namespace HNSWDemo static void Main(string[] args) + { + AutoClusteringTest(); + Console.WriteLine("Completed"); + Console.ReadKey(); + } + + static void AutoClusteringTest() { var vectors = RandomVectors(128, 3000); var world = SmallWorld.CreateWorld(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); world.AddItems(vectors); - DrawHistogram(world, @"D:\hist.jpg"); - Console.WriteLine("Completed"); - Console.ReadKey(); + var clusters = AutomaticGraphClusterer.DetectClusters(world); + Console.WriteLine($"Found {clusters.Count} clusters"); + for (int i = 0; i < clusters.Count; i++) + { + Console.WriteLine($"Cluster {i+1} countains {clusters[i].Count} items"); + } } - static void DrawHistogram(SmallWorld world, string filename) + static void HistogramTest() { + var vectors = RandomVectors(128, 3000); + var world = SmallWorld.CreateWorld(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); + world.AddItems(vectors); var histogram = world.GetHistogram(); + + int threshold = histogram.OTSU(); + var min = histogram.Bounds[threshold - 1]; + var max = histogram.Bounds[threshold]; + var R = (max + min) / 2; + + DrawHistogram(histogram, @"D:\hist.jpg"); + } + + static void DrawHistogram(Histogram histogram, string filename) + { /* while (histogram.CountSignChanges() > 3) { histogram.Smooth(); diff --git a/ZeroLevel.HNSW/Services/AutomaticGraphClusterer.cs b/ZeroLevel.HNSW/Services/AutomaticGraphClusterer.cs new file mode 100644 index 0000000..60177ef --- /dev/null +++ b/ZeroLevel.HNSW/Services/AutomaticGraphClusterer.cs @@ -0,0 +1,64 @@ +using System.Collections.Generic; + +namespace ZeroLevel.HNSW.Services +{ + public static class AutomaticGraphClusterer + { + private const int HALF_LONG_BITS = 32; + + public static List> DetectClusters(SmallWorld world) + { + var links = world.GetNSWLinks(); + // 1. Find R - bound between intra-cluster distances and out-of-cluster distances + var histogram = new Histogram(HistogramMode.SQRT, links.Values); + int threshold = histogram.OTSU(); + var min = histogram.Bounds[threshold - 1]; + var max = histogram.Bounds[threshold]; + var R = (max + min) / 2; + + // 2. Get links with distances less than R + var resultLinks = new SortedList(); + foreach (var pair in links) + { + if (pair.Value < R) + { + resultLinks.Add(pair.Key, pair.Value); + } + } + + // 3. Extract clusters + List> clusters = new List>(); + foreach (var pair in resultLinks) + { + var k = pair.Key; + var id1 = (int)(k >> HALF_LONG_BITS); + var id2 = (int)(k - (((long)id1) << HALF_LONG_BITS)); + + bool found = false; + foreach (var c in clusters) + { + if (c.Contains(id1)) + { + c.Add(id2); + found = true; + break; + } + else if (c.Contains(id2)) + { + c.Add(id1); + found = true; + break; + } + } + if (found == false) + { + var c = new HashSet(); + c.Add(id1); + c.Add(id2); + clusters.Add(c); + } + } + return clusters; + } + } +} diff --git a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs index ec6a021..de44419 100644 --- a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs +++ b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs @@ -15,6 +15,8 @@ namespace ZeroLevel.HNSW private SortedList _set = new SortedList(); + internal SortedList Links => _set; + internal (int, int) this[int index] { get diff --git a/ZeroLevel.HNSW/Services/Layer.cs b/ZeroLevel.HNSW/Services/Layer.cs index 8234241..ace3a65 100644 --- a/ZeroLevel.HNSW/Services/Layer.cs +++ b/ZeroLevel.HNSW/Services/Layer.cs @@ -14,6 +14,7 @@ namespace ZeroLevel.HNSW private readonly NSWOptions _options; private readonly VectorSet _vectors; private readonly CompactBiDirectionalLinksSet _links; + internal SortedList Links => _links.Links; /// /// There are links е the layer diff --git a/ZeroLevel.HNSW/SmallWorld.cs b/ZeroLevel.HNSW/SmallWorld.cs index 80cf216..26c9606 100644 --- a/ZeroLevel.HNSW/SmallWorld.cs +++ b/ZeroLevel.HNSW/SmallWorld.cs @@ -17,6 +17,7 @@ namespace ZeroLevel.HNSW private int MaxLayer = 0; private readonly ProbabilityLayerNumberGenerator _layerLevelGenerator; private ReaderWriterLockSlim _lockGraph = new ReaderWriterLockSlim(); + internal SortedList GetNSWLinks() => _layers[0].Links; public SmallWorld(NSWOptions options) {