diff --git a/TestHNSW/HNSWDemo/HNSWDemo.csproj b/TestHNSW/HNSWDemo/HNSWDemo.csproj
index 9b31502..b4e60da 100644
--- a/TestHNSW/HNSWDemo/HNSWDemo.csproj
+++ b/TestHNSW/HNSWDemo/HNSWDemo.csproj
@@ -5,6 +5,10 @@
net5.0
+
+
+
+
diff --git a/TestHNSW/HNSWDemo/Program.cs b/TestHNSW/HNSWDemo/Program.cs
index 2d946e5..57334a3 100644
--- a/TestHNSW/HNSWDemo/Program.cs
+++ b/TestHNSW/HNSWDemo/Program.cs
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
+using System.Drawing;
using System.IO;
using System.Linq;
using ZeroLevel.HNSW;
@@ -89,7 +90,7 @@ namespace HNSWDemo
{
var vector = new float[vectorSize];
DefaultRandomGenerator.Instance.NextFloats(vector);
- VectorUtils.NormalizeSIMD(vector);
+ //VectorUtils.NormalizeSIMD(vector);
vectors.Add(vector);
}
return vectors;
@@ -98,10 +99,53 @@ namespace HNSWDemo
static void Main(string[] args)
{
- TransformToCompactWorldTestWithAccuracity();
+ // Demo entry point: builds a small world from random vectors and renders the
+ // histogram returned by SmallWorld.GetHistogram to an image file.
+ var vectors = RandomVectors(128, 3000);
+ var world = SmallWorld.CreateWorld(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple));
+ world.AddItems(vectors);
+ // NOTE(review): the output path is hard-coded to a Windows drive; adjust before running elsewhere.
+ DrawHistogram(world, @"D:\hist.jpg");
+ Console.WriteLine("Completed");
Console.ReadKey();
}
+        /// <summary>
+        /// Renders the histogram of <paramref name="world"/> as a bar chart and saves it to
+        /// <paramref name="filename"/>. Bins reported by <c>GetMaximums</c> are outlined in red,
+        /// all other bins in blue, and the bin returned by <c>OTSU</c> is marked with a green
+        /// vertical line.
+        /// </summary>
+        /// <param name="world">World whose histogram is drawn.</param>
+        /// <param name="filename">Path of the image file to write.</param>
+        static void DrawHistogram(SmallWorld world, string filename)
+        {
+            const int imageWidth = 1200;
+            const int imageHeight = 600;
+
+            var histogram = world.GetHistogram();
+            var values = histogram.Values;
+
+            // Keep bars at least one pixel wide even when there are more bins than pixels.
+            var barWidth = Math.Max(1, imageWidth / values.Length);
+
+            // When every bin is empty the peak is 0; a zero scale avoids 600 / 0 = Infinity
+            // and the resulting (int)NaN bar heights.
+            var peak = values.Max();
+            var scale = peak > 0 ? imageHeight / (float)peak : 0f;
+
+            // Only membership is needed, so a set of indexes is enough.
+            var peakIndexes = new HashSet<int>(histogram.GetMaximums().Select(m => m.Index));
+            int threshold = histogram.OTSU();
+
+            using (var bmp = new Bitmap(imageWidth, imageHeight))
+            {
+                using (var g = Graphics.FromImage(bmp))
+                {
+                    for (int i = 0; i < values.Length; i++)
+                    {
+                        var height = (int)(values[i] * scale);
+                        var left = i * barWidth;
+                        var top = bmp.Height - height;
+                        if (peakIndexes.Contains(i))
+                        {
+                            // Two nested outlines make the peak bars look thicker.
+                            g.DrawRectangle(Pens.Red, left, top, barWidth, height);
+                            g.DrawRectangle(Pens.Red, left + 1, top, barWidth - 1, height);
+                        }
+                        else
+                        {
+                            g.DrawRectangle(Pens.Blue, left, top, barWidth, height);
+                        }
+                        if (i == threshold)
+                        {
+                            var middle = left + barWidth / 2;
+                            g.DrawLine(Pens.Green, middle, 0, middle, bmp.Height);
+                        }
+                    }
+                }
+                // NOTE(review): Save(string) is not given an explicit image format here;
+                // confirm the encoder actually used matches the file extension callers pass.
+                bmp.Save(filename);
+            }
+        }
+
static void TransformToCompactWorldTest()
{
var count = 10000;
diff --git a/ZeroLevel.HNSW/Model/Histogram.cs b/ZeroLevel.HNSW/Model/Histogram.cs
new file mode 100644
index 0000000..8ab2a89
--- /dev/null
+++ b/ZeroLevel.HNSW/Model/Histogram.cs
@@ -0,0 +1,183 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace ZeroLevel.HNSW
+{
+ /// <summary>
+ /// Describes a single histogram bin: its position, its count and its value range.
+ /// </summary>
+ public class HistogramValue
+ {
+ /// <summary>Position of the bin inside the histogram's value array.</summary>
+ public int Index { get; internal set; }
+ /// <summary>Number of samples counted in the bin.</summary>
+ public int Value { get; internal set; }
+ /// <summary>Lower boundary of the bin.</summary>
+ public float MinBound { get; internal set; }
+ /// <summary>Upper boundary of the bin.</summary>
+ public float MaxBound { get; internal set; }
+ }
+
+ public class Histogram
+ {
+ /// <summary>Rule that was used to choose the number of bins.</summary>
+ public HistogramMode Mode { get; }
+ /// <summary>Smallest sample passed to the constructor.</summary>
+ public float Min { get; }
+ /// <summary>Largest sample passed to the constructor.</summary>
+ public float Max { get; }
+ /// <summary>Width of one bin: (Max - Min) divided by the number of bins.</summary>
+ public float BoundsPeriod { get; }
+ /// <summary>Upper boundaries of every bin except the last one (one element fewer than <see cref="Values"/>).</summary>
+ public float[] Bounds { get; }
+ /// <summary>Number of samples counted in each bin.</summary>
+ public int[] Values { get; }
+
+        /// <summary>
+        /// Builds a histogram over <paramref name="data"/>.
+        /// </summary>
+        /// <param name="mode">
+        /// Selects how the number of bins is derived from the number of samples:
+        /// <see cref="HistogramMode.LOG"/> uses 1 + 3.2 * ln(count), any other mode uses sqrt(count).
+        /// </param>
+        /// <param name="data">
+        /// Samples to distribute over the bins. Samples below <see cref="float.Epsilon"/>
+        /// (zero and negative values) and NaN are not counted.
+        /// </param>
+        /// <exception cref="InvalidOperationException">
+        /// Thrown by the Min/Max queries when <paramref name="data"/> is empty.
+        /// </exception>
+        internal Histogram(HistogramMode mode, IList<float> data)
+        {
+            Mode = mode;
+            Min = data.Min();
+            Max = data.Max();
+
+            int binCount = mode == HistogramMode.LOG
+                ? (int)(1f + 3.2f * Math.Log(data.Count))
+                : (int)Math.Sqrt(data.Count);
+            BoundsPeriod = (Max - Min) / binCount;
+
+            // Bounds holds the upper edge of every bin except the last; the last bin is open-ended.
+            Bounds = new float[binCount - 1];
+            float bound = Min + BoundsPeriod;
+            for (int i = 0; i < Bounds.Length; i++)
+            {
+                Bounds[i] = bound;
+                bound += BoundsPeriod;
+            }
+
+            // New arrays are zero-initialised, so no explicit reset is needed.
+            Values = new int[binCount];
+            foreach (var v in data)
+            {
+                if (float.IsNaN(v) || v < float.Epsilon)
+                {
+                    continue;
+                }
+
+                // Find the first bound above the sample. Samples at or above the last bound
+                // (including Max itself) fall through to the final bin instead of being dropped.
+                int bin = 0;
+                while (bin < Bounds.Length && v >= Bounds[bin])
+                {
+                    bin++;
+                }
+                Values[bin]++;
+            }
+        }
+
+ /// <summary>Number of bins in the histogram, or 0 when <see cref="Values"/> is null.</summary>
+ public int Count => Values?.Length ?? 0;
+
+        /// <summary>
+        /// Counts how many times the slope between neighbouring bins switches between
+        /// rising and falling, ignoring leading empty bins and flat steps.
+        /// </summary>
+        /// <returns>
+        /// The number of direction changes; 0 when fewer than three bins remain after
+        /// skipping the leading empty ones.
+        /// </returns>
+        public int CountSignChanges()
+        {
+            int length = Values?.Length ?? 0;
+            if (length <= 2)
+            {
+                return 0;
+            }
+
+            // Skip leading empty bins. The bounds check keeps the scan inside the array
+            // when every bin is empty.
+            int i = 0;
+            while (i < length && Values[i] <= 0)
+            {
+                i++;
+            }
+            if ((length - i) <= 2)
+            {
+                return 0;
+            }
+
+            var delta = Values[i + 1] - Values[i];
+            int changes = 0;
+            i++;
+            for (; i < length - 1; i++)
+            {
+                var d = Values[i + 1] - Values[i];
+                if (d == 0)
+                {
+                    // A flat step does not change direction.
+                    continue;
+                }
+                if (NumbersHasSameSign(d, delta) == false)
+                {
+                    delta = d;
+                    changes++;
+                }
+            }
+            return changes;
+        }
+
+        /// <summary>
+        /// Smooths <see cref="Values"/> in place with a 5-bin moving average (integer division).
+        /// The two first and two last bins have no complete window and are left unchanged.
+        /// </summary>
+        public void Smooth()
+        {
+            if ((Values?.Length ?? 0) < 5)
+            {
+                // Not enough bins for a single complete window.
+                return;
+            }
+
+            // Average over a snapshot so already-smoothed bins do not feed into later ones.
+            var snapshot = (int[])Values.Clone();
+
+            // The last index with a full window is Length - 3, hence the bound Length - 2.
+            for (int i = 2; i < Values.Length - 2; i++)
+            {
+                Values[i] = (snapshot[i - 2] + snapshot[i - 1] + snapshot[i] + snapshot[i + 1] + snapshot[i + 2]) / 5;
+            }
+        }
+
+        /// <summary>
+        /// Finds the local maximums of the histogram: bins where the slope switches from
+        /// rising to falling. Leading empty bins and flat steps are ignored.
+        /// </summary>
+        /// <returns>
+        /// One <see cref="HistogramValue"/> per local maximum, in ascending bin order;
+        /// empty when fewer than three bins remain after skipping the leading empty ones.
+        /// </returns>
+        public IEnumerable<HistogramValue> GetMaximums()
+        {
+            var list = new List<HistogramValue>();
+
+            int length = Values?.Length ?? 0;
+            if (length <= 2)
+            {
+                return list;
+            }
+
+            // Skip leading empty bins. The bounds check keeps the scan inside the array
+            // when every bin is empty.
+            int i = 0;
+            while (i < length && Values[i] <= 0)
+            {
+                i++;
+            }
+            if ((length - i) <= 2)
+            {
+                return list;
+            }
+
+            var delta = Values[i + 1] - Values[i];
+            i++;
+            for (; i < length - 1; i++)
+            {
+                var d = Values[i + 1] - Values[i];
+                if (d == 0)
+                {
+                    // A flat step does not change direction.
+                    continue;
+                }
+                if (NumbersHasSameSign(d, delta) == false)
+                {
+                    // Rising before and falling now: bin i is a peak.
+                    if (delta > 0)
+                    {
+                        list.Add(new HistogramValue
+                        {
+                            Index = i,
+                            Value = Values[i],
+                            MinBound = Bounds[i - 1],
+                            MaxBound = Bounds[i]
+                        });
+                    }
+                    delta = d;
+                }
+            }
+            return list;
+        }
+
+ #region OTSU "https://en.wikipedia.org/wiki/Otsu's_method"
+ // function is used to compute the q values in the equation
+        /// <summary>
+        /// Sums the bin counts for indexes from <paramref name="init"/> (inclusive) to
+        /// <paramref name="end"/> (exclusive); used as the class weight by <see cref="OTSU"/>.
+        /// </summary>
+        private float Px(int init, int end)
+        {
+            int total = 0;
+            int position = init;
+            while (position < end)
+            {
+                total += Values[position];
+                position++;
+            }
+            return total;
+        }
+ // function is used to compute the mean values in the equation (mu)
+        /// <summary>
+        /// Sums index * count for bins from <paramref name="init"/> (inclusive) to
+        /// <paramref name="end"/> (exclusive); used as the class moment by <see cref="OTSU"/>.
+        /// </summary>
+        private float Mx(int init, int end)
+        {
+            int total = 0;
+            int position = init;
+            while (position < end)
+            {
+                total += position * Values[position];
+                position++;
+            }
+            return total;
+        }
+
+        /// <summary>
+        /// Picks a threshold bin by scoring every candidate bin k and keeping the one with
+        /// the highest score. The two classes are the bins before k and the bins after k;
+        /// bin k itself belongs to neither sum.
+        /// </summary>
+        /// <returns>Index of the best-scoring bin; 0 when no candidate scores above zero.</returns>
+        public int OTSU()
+        {
+            int best = 0;
+            float bestScore = 0;
+            int binCount = Values.Length;
+
+            for (int k = 0; k < binCount; k++)
+            {
+                float lower = Px(0, k);
+                float upper = Px(k + 1, binCount);
+
+                // An empty class would make the divisor zero; fall back to 1.
+                float weight = lower * upper;
+                if (weight == 0)
+                {
+                    weight = 1;
+                }
+
+                float diff = (Mx(0, k) * upper) - (Mx(k + 1, binCount) * lower);
+                float score = diff * diff / weight;
+                if (score > bestScore)
+                {
+                    bestScore = score;
+                    best = k;
+                }
+            }
+            return best;
+        }
+ #endregion
+
+        /// <summary>
+        /// Tells whether both numbers are on the same side of zero; zero counts as non-negative.
+        /// </summary>
+        static bool NumbersHasSameSign(int left, int right)
+        {
+            bool leftNonNegative = left >= 0;
+            bool rightNonNegative = right >= 0;
+            return leftNonNegative == rightNonNegative;
+        }
+ }
+}
diff --git a/ZeroLevel.HNSW/Model/HistogramMode.cs b/ZeroLevel.HNSW/Model/HistogramMode.cs
new file mode 100644
index 0000000..b897d0d
--- /dev/null
+++ b/ZeroLevel.HNSW/Model/HistogramMode.cs
@@ -0,0 +1,14 @@
+namespace ZeroLevel.HNSW
+{
+ public enum HistogramMode
+ {
+ /// <summary>
+ /// Sqrt(LinksCount)
+ /// </summary>
+ SQRT,
+ /// <summary>
+ /// 1 + 3.2 * Ln(LinksCount)
+ /// </summary>
+ LOG
+ }
+}
diff --git a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
index dd58909..ec6a021 100644
--- a/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
+++ b/ZeroLevel.HNSW/Services/CompactBiDirectionalLinksSet.cs
@@ -254,6 +254,11 @@ namespace ZeroLevel.HNSW
}
}
+        /// <summary>
+        /// Builds a histogram over the values stored in this links set.
+        /// </summary>
+        /// <param name="mode">Rule used to choose the number of bins.</param>
+        public Histogram CalculateHistogram(HistogramMode mode)
+            => new Histogram(mode, _set.Values);
+
internal float Distance(int id1, int id2)
{
long k = (((long)(id1)) << HALF_LONG_BITS) + id2;
diff --git a/ZeroLevel.HNSW/Services/Layer.cs b/ZeroLevel.HNSW/Services/Layer.cs
index 713d6ea..8234241 100644
--- a/ZeroLevel.HNSW/Services/Layer.cs
+++ b/ZeroLevel.HNSW/Services/Layer.cs
@@ -486,5 +486,7 @@ namespace ZeroLevel.HNSW
{
_links.Deserialize(reader);
}
+
+        /// <summary>
+        /// Returns the histogram calculated by this layer's links set.
+        /// </summary>
+        /// <param name="mode">Rule used to choose the number of bins.</param>
+        internal Histogram GetHistogram(HistogramMode mode)
+        {
+            return _links.CalculateHistogram(mode);
+        }
}
}
\ No newline at end of file
diff --git a/ZeroLevel.HNSW/SmallWorld.cs b/ZeroLevel.HNSW/SmallWorld.cs
index 9101f81..80cf216 100644
--- a/ZeroLevel.HNSW/SmallWorld.cs
+++ b/ZeroLevel.HNSW/SmallWorld.cs
@@ -343,5 +343,8 @@ namespace ZeroLevel.HNSW
}
}
}
+
+        /// <summary>
+        /// Returns the histogram of the first layer (index 0) of this world.
+        /// </summary>
+        /// <param name="mode">Rule used to choose the number of bins; defaults to <see cref="HistogramMode.SQRT"/>.</param>
+        public Histogram GetHistogram(HistogramMode mode = HistogramMode.SQRT)
+        {
+            var baseLayer = _layers[0];
+            return baseLayer.GetHistogram(mode);
+        }
}
}
diff --git a/ZeroLevel.HNSW/Utils/EuclidDistance.cs b/ZeroLevel.HNSW/Utils/EuclidDistance.cs
index 1fa0211..fe79ee3 100644
--- a/ZeroLevel.HNSW/Utils/EuclidDistance.cs
+++ b/ZeroLevel.HNSW/Utils/EuclidDistance.cs
@@ -1,6 +1,6 @@
using System;
-namespace ZeroLevel.HNSW.Utils
+namespace ZeroLevel.HNSW
{
public static class Metrics
{
diff --git a/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs b/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
index f37b664..89a4a7c 100644
--- a/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
+++ b/ZeroLevel.HNSW/Utils/ProbabilityLayerNumberGenerator.cs
@@ -8,7 +8,6 @@ namespace ZeroLevel.HNSW.Services
internal ProbabilityLayerNumberGenerator(int maxLayers, int M)
{
- _mL = maxLayers;
_probabilities = new float[maxLayers];
var m_L = 1.0f / Math.Log(M);
for (int i = 0; i < maxLayers; i++)