From 33e20488627bc47541cf1a260fac13da53bc89cd Mon Sep 17 00:00:00 2001
From: unknown
Date: Mon, 13 Dec 2021 02:09:42 +0300
Subject: [PATCH] HNSW

Append histogram
OTSU method for histogram - detect bound between intra-cluster distance and
out-of-cluster distance
---
 TestHNSW/HNSWDemo/HNSWDemo.csproj             |   4 +
 TestHNSW/HNSWDemo/Program.cs                  |  48 +++-
 ZeroLevel.HNSW/Model/Histogram.cs             | 239 ++++++++++++++++++
 ZeroLevel.HNSW/Model/HistogramMode.cs         |  14 +
 .../Services/CompactBiDirectionalLinksSet.cs  |   8 +
 ZeroLevel.HNSW/Services/Layer.cs              |   3 +
 ZeroLevel.HNSW/SmallWorld.cs                  |   4 +
 ZeroLevel.HNSW/Utils/EuclidDistance.cs        |   2 +-
 .../Utils/ProbabilityLayerNumberGenerator.cs  |   1 -
 9 files changed, 319 insertions(+), 4 deletions(-)
 create mode 100644 ZeroLevel.HNSW/Model/Histogram.cs
 create mode 100644 ZeroLevel.HNSW/Model/HistogramMode.cs

diff --git a/TestHNSW/HNSWDemo/HNSWDemo.csproj b/TestHNSW/HNSWDemo/HNSWDemo.csproj
index 9b31502..b4e60da 100644
--- a/TestHNSW/HNSWDemo/HNSWDemo.csproj
+++ b/TestHNSW/HNSWDemo/HNSWDemo.csproj
@@ -5,6 +5,10 @@
     net5.0
+
+
+
+
diff --git a/TestHNSW/HNSWDemo/Program.cs b/TestHNSW/HNSWDemo/Program.cs
index 2d946e5..57334a3 100644
--- a/TestHNSW/HNSWDemo/Program.cs
+++ b/TestHNSW/HNSWDemo/Program.cs
@@ -1,6 +1,7 @@
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
+using System.Drawing;
 using System.IO;
 using System.Linq;
 using ZeroLevel.HNSW;
@@ -89,7 +90,7 @@ namespace HNSWDemo
             {
                 var vector = new float[vectorSize];
                 DefaultRandomGenerator.Instance.NextFloats(vector);
-                VectorUtils.NormalizeSIMD(vector);
+                //VectorUtils.NormalizeSIMD(vector);
                 vectors.Add(vector);
             }
             return vectors;
@@ -98,10 +99,53 @@ namespace HNSWDemo
 
         static void Main(string[] args)
         {
-            TransformToCompactWorldTestWithAccuracity();
+            var vectors = RandomVectors(128, 3000);
+            var world = SmallWorld.CreateWorld(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple));
+            world.AddItems(vectors);
+            DrawHistogram(world, @"D:\hist.jpg");
+            Console.WriteLine("Completed");
             Console.ReadKey();
         }
 
+        static void DrawHistogram(SmallWorld world, string filename)
+        {
+            var histogram = world.GetHistogram();
+            /* while (histogram.CountSignChanges() > 3)
+             {
+                 histogram.Smooth();
+             }*/
+            var wb = 1200 / histogram.Values.Length;
+            var k = 600.0f / (float)histogram.Values.Max();
+
+            var maxes = histogram.GetMaximums().ToDictionary(m => m.Index, m => m);
+            int threshold = histogram.OTSU();
+
+            using (var bmp = new Bitmap(1200, 600))
+            {
+                using (var g = Graphics.FromImage(bmp))
+                {
+                    for (int i = 0; i < histogram.Values.Length; i++)
+                    {
+                        var height = (int)(histogram.Values[i] * k);
+                        if (maxes.ContainsKey(i))
+                        {
+                            g.DrawRectangle(Pens.Red, i * wb, bmp.Height - height, wb, height);
+                            g.DrawRectangle(Pens.Red, i * wb + 1, bmp.Height - height, wb - 1, height);
+                        }
+                        else
+                        {
+                            g.DrawRectangle(Pens.Blue, i * wb, bmp.Height - height, wb, height);
+                        }
+                        if (i == threshold)
+                        {
+                            g.DrawLine(Pens.Green, i * wb + wb / 2, 0, i * wb + wb / 2, bmp.Height);
+                        }
+                    }
+                }
+                bmp.Save(filename);
+            }
+        }
+
         static void TransformToCompactWorldTest()
         {
             var count = 10000;
diff --git a/ZeroLevel.HNSW/Model/Histogram.cs b/ZeroLevel.HNSW/Model/Histogram.cs
new file mode 100644
index 0000000..8ab2a89
--- /dev/null
+++ b/ZeroLevel.HNSW/Model/Histogram.cs
@@ -0,0 +1,239 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace ZeroLevel.HNSW
+{
+    /// <summary>
+    /// A single histogram bin, as reported by <see cref="Histogram.GetMaximums"/>.
+    /// </summary>
+    public class HistogramValue
+    {
+        /// <summary>Position of the bin in <see cref="Histogram.Values"/>.</summary>
+        public int Index { get; internal set; }
+        /// <summary>Number of samples counted in the bin.</summary>
+        public int Value { get; internal set; }
+        /// <summary>Lower edge of the bin.</summary>
+        public float MinBound { get; internal set; }
+        /// <summary>Upper edge of the bin.</summary>
+        public float MaxBound { get; internal set; }
+    }
+
+    /// <summary>
+    /// Equal-width histogram over a list of float samples, with helpers for
+    /// smoothing, local-maximum detection and Otsu thresholding.
+    /// </summary>
+    public class Histogram
+    {
+        /// <summary>Rule that was used to choose the number of bins.</summary>
+        public HistogramMode Mode { get; }
+        /// <summary>Smallest sample (0 when the input was empty).</summary>
+        public float Min { get; }
+        /// <summary>Largest sample (0 when the input was empty).</summary>
+        public float Max { get; }
+        /// <summary>Width of one bin: (Max - Min) / bin count.</summary>
+        public float BoundsPeriod { get; }
+        /// <summary>
+        /// Inner bin edges, one fewer than <see cref="Values"/>: bin i spans
+        /// [Bounds[i - 1], Bounds[i]); bin 0 starts at Min and the last bin ends at Max.
+        /// </summary>
+        public float[] Bounds { get; }
+        /// <summary>Number of samples counted in each bin.</summary>
+        public int[] Values { get; }
+
+        /// <summary>
+        /// Counts <paramref name="data"/> into Sqrt(n) or 1 + 3.2 * Ln(n) equal-width
+        /// bins, depending on <paramref name="mode"/>.
+        /// </summary>
+        /// <param name="mode">Bin-count rule.</param>
+        /// <param name="data">Samples to count; an empty list yields an empty histogram.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+        internal Histogram(HistogramMode mode, IList<float> data)
+        {
+            if (data == null) throw new ArgumentNullException(nameof(data));
+
+            Mode = mode;
+            if (data.Count == 0)
+            {
+                // Min()/Max() throw on an empty sequence; expose an empty histogram instead.
+                Bounds = new float[0];
+                Values = new int[0];
+                return;
+            }
+
+            Min = data.Min();
+            Max = data.Max();
+            // Both rules yield at least one bin for a non-empty input.
+            int bins = mode == HistogramMode.LOG
+                ? (int)(1f + 3.2f * Math.Log(data.Count))
+                : (int)Math.Sqrt(data.Count);
+            BoundsPeriod = (Max - Min) / bins;
+
+            Bounds = new float[bins - 1];
+            float bound = Min + BoundsPeriod;
+            for (int i = 0; i < Bounds.Length; i++)
+            {
+                Bounds[i] = bound;
+                bound += BoundsPeriod;
+            }
+
+            Values = new int[bins];
+            foreach (var v in data)
+            {
+                // Samples below float.Epsilon (zero and negatives) are not counted.
+                if (v < float.Epsilon) continue;
+                Values[FindBin(v)]++;
+            }
+        }
+
+        /// <summary>Number of bins.</summary>
+        public int Count => Values?.Length ?? 0;
+
+        /// <summary>
+        /// Counts how often the bin-to-bin slope changes sign, scanning from the
+        /// first non-empty bin. Flat steps are skipped.
+        /// </summary>
+        public int CountSignChanges()
+        {
+            if (Count <= 2) return 0;
+            int start = FirstNonEmptyBin();
+            if ((Values.Length - start) <= 2) return 0;
+
+            var delta = Values[start + 1] - Values[start];
+            int changes = 0;
+            for (int i = start + 1; i < Values.Length - 1; i++)
+            {
+                var d = Values[i + 1] - Values[i];
+                if (d == 0)
+                {
+                    continue;
+                }
+                if (NumbersHasSameSign(d, delta) == false)
+                {
+                    delta = d;
+                    changes++;
+                }
+            }
+            return changes;
+        }
+
+        /// <summary>
+        /// Replaces each bin that has two neighbours on both sides with the integer
+        /// mean of its 5-bin window; the two outermost bins on each side are kept.
+        /// </summary>
+        public void Smooth()
+        {
+            var buffer = (int[])Values.Clone();
+            // The last valid centre is Length - 3: its window ends at Length - 1.
+            for (int i = 2; i < Values.Length - 2; i++)
+            {
+                Values[i] = (buffer[i - 2] + buffer[i - 1] + buffer[i] + buffer[i + 1] + buffer[i + 2]) / 5;
+            }
+        }
+
+        /// <summary>
+        /// Returns the bins where the slope turns from rising to falling, scanning
+        /// from the first non-empty bin. The first scanned bin and the last bin are
+        /// never reported.
+        /// </summary>
+        public IEnumerable<HistogramValue> GetMaximums()
+        {
+            var list = new List<HistogramValue>();
+            if (Count <= 2) return list;
+            int start = FirstNonEmptyBin();
+            if ((Values.Length - start) <= 2) return list;
+
+            var delta = Values[start + 1] - Values[start];
+            for (int i = start + 1; i < Values.Length - 1; i++)
+            {
+                var d = Values[i + 1] - Values[i];
+                if (d == 0)
+                {
+                    continue;
+                }
+                if (NumbersHasSameSign(d, delta) == false)
+                {
+                    if (delta > 0)
+                    {
+                        list.Add(new HistogramValue
+                        {
+                            Index = i,
+                            Value = Values[i],
+                            MinBound = Bounds[i - 1],
+                            MaxBound = Bounds[i]
+                        });
+                    }
+                    delta = d;
+                }
+            }
+            return list;
+        }
+
+        #region OTSU "https://en.wikipedia.org/wiki/Otsu's_method"
+        /// <summary>
+        /// Otsu threshold: the bin index k that maximises the between-class variance
+        /// when bins [0..k] form one class and bins [k + 1..] the other.
+        /// </summary>
+        /// <returns>The chosen k, or 0 when no split separates two non-empty classes.</returns>
+        public int OTSU()
+        {
+            // Totals are taken once so every candidate split costs O(1).
+            double totalCount = 0, totalMoment = 0;
+            for (int i = 0; i < Values.Length; i++)
+            {
+                totalCount += Values[i];
+                totalMoment += (double)i * Values[i];
+            }
+
+            int threshold = 0;
+            double best = 0;
+            double lowCount = 0, lowMoment = 0;
+            for (int k = 0; k < Values.Length; k++)
+            {
+                // Bin k belongs to the lower class.
+                lowCount += Values[k];
+                lowMoment += (double)k * Values[k];
+                double highCount = totalCount - lowCount;
+                double highMoment = totalMoment - lowMoment;
+
+                double weight = lowCount * highCount;
+                if (weight == 0) continue; // one class is empty
+
+                // (m1 * p2 - m2 * p1)^2 / (p1 * p2), proportional to the between-class variance.
+                double diff = lowMoment * highCount - highMoment * lowCount;
+                double score = diff * diff / weight;
+                if (score > best)
+                {
+                    best = score;
+                    threshold = k;
+                }
+            }
+            return threshold;
+        }
+        #endregion
+
+        /// <summary>Index of the first bin with a positive count, or Values.Length if none.</summary>
+        private int FirstNonEmptyBin()
+        {
+            int i = 0;
+            while (i < Values.Length && Values[i] <= 0) i++;
+            return i;
+        }
+
+        /// <summary>Index of the bin that contains <paramref name="v"/>.</summary>
+        private int FindBin(float v)
+        {
+            for (int i = 0; i < Bounds.Length; i++)
+            {
+                if (v < Bounds[i]) return i;
+            }
+            // At or above the last inner edge: the top bin.
+            return Bounds.Length;
+        }
+
+        static bool NumbersHasSameSign(int left, int right)
+        {
+            return left >= 0 && right >= 0 || left < 0 && right < 0;
+        }
+    }
+}
diff --git a/ZeroLevel.HNSW/Model/HistogramMode.cs b/ZeroLevel.HNSW/Model/HistogramMode.cs
new file mode 100644
index 0000000..b897d0d
--- /dev/null
+++ b/ZeroLevel.HNSW/Model/HistogramMode.cs
@@ -0,0 +1,14 @@
+namespace ZeroLevel.HNSW
+{
+    public enum HistogramMode
+    {
+        /// <summary>
+        /// Sqrt(LinksCount)
+        /// </summary>
+        SQRT,
+        /// <summary>
+        /// 1 + 3.2 * Ln(LinksCount)
+        /// </summary>
+        LOG
+    }
+}
diff --git a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
index dd58909..ec6a021 100644
--- a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
+++ b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
@@ -254,6 +254,14 @@ namespace ZeroLevel.HNSW
             }
         }
 
+        /// <summary>
+        /// Builds a histogram over the values currently stored in this links set.
+        /// </summary>
+        public Histogram CalculateHistogram(HistogramMode mode)
+        {
+            return new Histogram(mode, _set.Values);
+        }
+
         internal float Distance(int id1, int id2)
         {
             long k = (((long)(id1)) << HALF_LONG_BITS) + id2;
diff --git a/ZeroLevel.HNSW/Services/Layer.cs b/ZeroLevel.HNSW/Services/Layer.cs
index 713d6ea..8234241 100644
--- a/ZeroLevel.HNSW/Services/Layer.cs
+++ b/ZeroLevel.HNSW/Services/Layer.cs
@@ -486,5 +486,8 @@ namespace ZeroLevel.HNSW
         {
             _links.Deserialize(reader);
         }
+
+        /// <summary>Histogram of this layer's links; delegates to the links set.</summary>
+        internal Histogram GetHistogram(HistogramMode mode) => _links.CalculateHistogram(mode);
     }
 }
\ No newline at end of file
diff --git a/ZeroLevel.HNSW/SmallWorld.cs b/ZeroLevel.HNSW/SmallWorld.cs
index 9101f81..80cf216 100644
--- a/ZeroLevel.HNSW/SmallWorld.cs
+++ b/ZeroLevel.HNSW/SmallWorld.cs
@@ -343,5 +343,9 @@ namespace ZeroLevel.HNSW
                 }
             }
         }
+
+        /// <summary>Histogram computed from layer 0.</summary>
+        public Histogram GetHistogram(HistogramMode mode = HistogramMode.SQRT)
+            => _layers[0].GetHistogram(mode);
     }
 }
diff --git a/ZeroLevel.HNSW/Utils/EuclidDistance.cs b/ZeroLevel.HNSW/Utils/EuclidDistance.cs
index 1fa0211..fe79ee3 100644
--- a/ZeroLevel.HNSW/Utils/EuclidDistance.cs
+++ b/ZeroLevel.HNSW/Utils/EuclidDistance.cs
@@ -1,6 +1,6 @@
 using System;
 
-namespace ZeroLevel.HNSW.Utils
+namespace ZeroLevel.HNSW
 {
     public static class Metrics
     {
diff --git a/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs b/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
index f37b664..89a4a7c 100644
--- a/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
+++ b/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
@@ -8,7 +8,6 @@ namespace ZeroLevel.HNSW.Services
 
         internal ProbabilityLayerNumberGenerator(int maxLayers, int M)
         {
-            _mL = maxLayers;
             _probabilities = new float[maxLayers];
             var m_L = 1.0f / Math.Log(M);
             for (int i = 0; i < maxLayers; i++)