Append histogram
OTSU method for histogram - detect bound between intra-cluster distance and out-of-cluster distance
pull/1/head
unknown 3 years ago
parent d8095157af
commit 33e2048862

@ -5,6 +5,10 @@
<TargetFramework>net5.0</TargetFramework> <TargetFramework>net5.0</TargetFramework>
</PropertyGroup> </PropertyGroup>
<ItemGroup>
<PackageReference Include="System.Drawing.Common" Version="6.0.0" />
</ItemGroup>
<ItemGroup> <ItemGroup>
<ProjectReference Include="..\..\ZeroLevel.HNSW\ZeroLevel.HNSW.csproj" /> <ProjectReference Include="..\..\ZeroLevel.HNSW\ZeroLevel.HNSW.csproj" />
</ItemGroup> </ItemGroup>

@ -1,6 +1,7 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics; using System.Diagnostics;
using System.Drawing;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using ZeroLevel.HNSW; using ZeroLevel.HNSW;
@ -89,7 +90,7 @@ namespace HNSWDemo
{ {
var vector = new float[vectorSize]; var vector = new float[vectorSize];
DefaultRandomGenerator.Instance.NextFloats(vector); DefaultRandomGenerator.Instance.NextFloats(vector);
VectorUtils.NormalizeSIMD(vector); //VectorUtils.NormalizeSIMD(vector);
vectors.Add(vector); vectors.Add(vector);
} }
return vectors; return vectors;
@ -98,10 +99,53 @@ namespace HNSWDemo
static void Main(string[] args) static void Main(string[] args)
{ {
TransformToCompactWorldTestWithAccuracity(); var vectors = RandomVectors(128, 3000);
var world = SmallWorld.CreateWorld<float[]>(NSWOptions<float[]>.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple));
world.AddItems(vectors);
DrawHistogram(world, @"D:\hist.jpg");
Console.WriteLine("Completed");
Console.ReadKey(); Console.ReadKey();
} }
/// <summary>
/// Renders the histogram of <paramref name="world"/> as a bar chart and saves it to
/// <paramref name="filename"/>. Bins reported by <c>GetMaximums()</c> are outlined in red,
/// all other bins in blue, and a green vertical line marks the bin returned by <c>OTSU()</c>.
/// </summary>
/// <param name="world">World whose histogram is drawn.</param>
/// <param name="filename">Path of the image file to write.</param>
static void DrawHistogram(SmallWorld<float[]> world, string filename)
{
    const int ImageWidth = 1200;
    const int ImageHeight = 600;

    var histogram = world.GetHistogram();
    // Optional pre-processing that was left disabled in the original:
    //   while (histogram.CountSignChanges() > 3) histogram.Smooth();

    var values = histogram.Values;
    if (values == null || values.Length == 0)
    {
        // No bins: nothing to draw, and the bar-width division below would fail.
        return;
    }

    // Integer division gives 0 once there are more bins than pixels; keep bars at least 1px wide.
    var barWidth = Math.Max(1, ImageWidth / values.Length);

    // Scale so the tallest bar spans the full image height; avoid dividing by zero
    // when every bin is empty.
    var tallest = values.Max();
    var scale = tallest > 0 ? (float)ImageHeight / tallest : 0f;

    // Only the indices of the maxima are needed, so a set is enough.
    var peakIndices = new HashSet<int>(histogram.GetMaximums().Select(m => m.Index));
    int threshold = histogram.OTSU();

    using (var bmp = new Bitmap(ImageWidth, ImageHeight))
    {
        using (var g = Graphics.FromImage(bmp))
        {
            for (int i = 0; i < values.Length; i++)
            {
                var height = (int)(values[i] * scale);
                var left = i * barWidth;
                var top = bmp.Height - height;

                if (peakIndices.Contains(i))
                {
                    g.DrawRectangle(Pens.Red, left, top, barWidth, height);
                    // A second outline shifted by one pixel makes the peak bar look thicker.
                    g.DrawRectangle(Pens.Red, left + 1, top, barWidth - 1, height);
                }
                else
                {
                    g.DrawRectangle(Pens.Blue, left, top, barWidth, height);
                }

                if (i == threshold)
                {
                    var middle = left + barWidth / 2;
                    g.DrawLine(Pens.Green, middle, 0, middle, bmp.Height);
                }
            }
        }
        // NOTE(review): Image.Save(string) does not choose the encoder from the file
        // extension - confirm the produced format is the one expected for a ".jpg" path.
        bmp.Save(filename);
    }
}
static void TransformToCompactWorldTest() static void TransformToCompactWorldTest()
{ {
var count = 10000; var count = 10000;

@ -0,0 +1,183 @@
using System;
using System.Collections.Generic;
using System.Linq;
namespace ZeroLevel.HNSW
{
/// <summary>
/// Describes a single histogram bin; instances are produced by <c>Histogram.GetMaximums()</c>.
/// </summary>
public class HistogramValue
{
/// <summary>Zero-based index of the bin inside <c>Histogram.Values</c>.</summary>
public int Index { get; internal set; }
/// <summary>Count stored in the bin (<c>Histogram.Values[Index]</c>).</summary>
public int Value { get; internal set; }
/// <summary>Lower edge of the bin (<c>Histogram.Bounds[Index - 1]</c> when filled by <c>GetMaximums()</c>).</summary>
public float MinBound { get; internal set; }
/// <summary>Upper edge of the bin (<c>Histogram.Bounds[Index]</c> when filled by <c>GetMaximums()</c>).</summary>
public float MaxBound { get; internal set; }
}
/// <summary>
/// Equal-width histogram over a list of float values, with helpers for smoothing,
/// locating local maxima and computing an Otsu threshold over the bin counts.
/// </summary>
public class Histogram
{
    /// <summary>Rule that was used to choose the number of bins.</summary>
    public HistogramMode Mode { get; }

    /// <summary>Smallest value found in the source data.</summary>
    public float Min { get; }

    /// <summary>Largest value found in the source data.</summary>
    public float Max { get; }

    /// <summary>Width of one bin: (Max - Min) / number of bins.</summary>
    public float BoundsPeriod { get; }

    /// <summary>
    /// Upper edges of all bins except the last one, in ascending order
    /// (its length is <c>Values.Length - 1</c>).
    /// </summary>
    public float[] Bounds { get; }

    /// <summary>Number of counted values per bin.</summary>
    public int[] Values { get; }

    /// <summary>
    /// Builds the histogram. The number of bins is <c>(int)(1 + 3.2 * ln(n))</c> for
    /// <see cref="HistogramMode.LOG"/> and <c>(int)sqrt(n)</c> otherwise, where n is
    /// <c>data.Count</c>. Values smaller than <see cref="float.Epsilon"/> (zero and
    /// negative values) are not counted.
    /// </summary>
    /// <param name="mode">Rule used to choose the number of bins.</param>
    /// <param name="data">Values to distribute over the bins; must not be empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="data"/> is empty.</exception>
    internal Histogram(HistogramMode mode, IList<float> data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        Mode = mode;
        Min = data.Min();
        Max = data.Max();

        int binCount = mode == HistogramMode.LOG
            ? (int)(1f + 3.2f * Math.Log(data.Count))
            : (int)(Math.Sqrt(data.Count));

        BoundsPeriod = (Max - Min) / binCount;

        Bounds = new float[binCount - 1];
        float bound = Min + BoundsPeriod;
        for (int i = 0; i < Bounds.Length; i++)
        {
            Bounds[i] = bound;
            bound += BoundsPeriod;
        }

        // A freshly allocated int[] is already zero-filled.
        Values = new int[binCount];
        foreach (var v in data)
        {
            if (v < float.Epsilon)
            {
                continue;
            }
            Values[FindBin(v)]++;
        }
    }

    /// <summary>Number of bins.</summary>
    public int Count => Values?.Length ?? 0;

    /// <summary>
    /// Counts how many times the slope of the bin counts changes direction, starting from
    /// the first non-empty bin. Equal neighbouring bins are ignored.
    /// </summary>
    /// <returns>The number of direction changes; 0 when fewer than three bins remain to compare.</returns>
    public int CountSignChanges()
    {
        if (Count <= 2)
        {
            return 0;
        }

        int i = FirstNonEmptyBin();
        if ((Values.Length - i) <= 2)
        {
            return 0;
        }

        var delta = Values[i + 1] - Values[i];
        int changes = 0;
        for (i++; i < Values.Length - 1; i++)
        {
            var d = Values[i + 1] - Values[i];
            if (d == 0)
            {
                continue;
            }
            if (!NumbersHasSameSign(d, delta))
            {
                delta = d;
                changes++;
            }
        }
        return changes;
    }

    /// <summary>
    /// Smooths the bin counts in place with a 5-bin moving average (integer division).
    /// The two bins at each end have no full window and are left unchanged.
    /// </summary>
    public void Smooth()
    {
        var buffer = new int[Values.Length];
        Array.Copy(Values, buffer, buffer.Length);
        // The window reads i - 2 .. i + 2, so the last valid centre is Length - 3.
        for (int i = 2; i < Values.Length - 2; i++)
        {
            Values[i] = (buffer[i - 2] + buffer[i - 1] + buffer[i] + buffer[i + 1] + buffer[i + 2]) / 5;
        }
    }

    /// <summary>
    /// Finds the bins where the counts stop rising and start falling. Scanning starts after
    /// the first non-empty bin and stops before the last bin, so those two are never reported.
    /// Equal neighbouring bins are skipped, so for a flat top the last bin of the plateau is reported.
    /// </summary>
    /// <returns>The local maxima in ascending bin order; empty when there are too few bins.</returns>
    public IEnumerable<HistogramValue> GetMaximums()
    {
        var list = new List<HistogramValue>();
        if (Count <= 2)
        {
            return list;
        }

        int i = FirstNonEmptyBin();
        if ((Values.Length - i) <= 2)
        {
            return list;
        }

        var delta = Values[i + 1] - Values[i];
        for (i++; i < Values.Length - 1; i++)
        {
            var d = Values[i + 1] - Values[i];
            if (d == 0)
            {
                continue;
            }
            if (!NumbersHasSameSign(d, delta))
            {
                // Direction changed; it is a peak only if the counts were rising before.
                if (delta > 0)
                {
                    list.Add(new HistogramValue
                    {
                        Index = i,
                        Value = Values[i],
                        MinBound = Bounds[i - 1],
                        MaxBound = Bounds[i]
                    });
                }
                delta = d;
            }
        }
        return list;
    }

    #region OTSU "https://en.wikipedia.org/wiki/Otsu's_method"
    /// <summary>
    /// Otsu's method over the bin counts: picks the split that maximises the between-class
    /// variance of the two groups of bins.
    /// </summary>
    /// <returns>
    /// Index k such that bins 0..k form the lower class and bins k+1..Count-1 the upper class;
    /// 0 when no split produces a positive between-class variance.
    /// </returns>
    public int OTSU()
    {
        // Totals over all bins; 64-bit so large counts multiplied by indices cannot overflow.
        long totalCount = 0;
        long totalWeighted = 0;
        for (int i = 0; i < Values.Length; i++)
        {
            totalCount += Values[i];
            totalWeighted += (long)i * Values[i];
        }

        int threshold = 0;
        double bestVariance = 0;
        long lowerCount = 0;
        long lowerWeighted = 0;
        for (int k = 0; k < Values.Length; k++)
        {
            // Bin k belongs to the lower class, so every bin is in exactly one class.
            lowerCount += Values[k];
            lowerWeighted += (long)k * Values[k];

            double p1 = lowerCount;
            double p2 = totalCount - lowerCount;
            double p12 = p1 * p2;
            if (p12 == 0)
            {
                // One class is empty: the numerator is 0 as well, only avoid dividing by zero.
                p12 = 1;
            }

            double diff = (lowerWeighted * p2) - ((totalWeighted - lowerWeighted) * p1);
            double variance = diff * diff / p12;
            if (variance > bestVariance)
            {
                bestVariance = variance;
                threshold = k;
            }
        }
        return threshold;
    }
    #endregion

    /// <summary>
    /// Returns the index of the bin for <paramref name="value"/>: the first bin whose upper
    /// edge is greater than the value, or the last bin when no edge is.
    /// </summary>
    private int FindBin(float value)
    {
        for (int i = 0; i < Bounds.Length; i++)
        {
            if (value < Bounds[i])
            {
                return i;
            }
        }
        // Bounds holds one edge fewer than there are bins, so Bounds.Length is the last bin.
        return Bounds.Length;
    }

    /// <summary>
    /// Index of the first bin with a positive count, or <c>Values.Length</c> when all bins are empty.
    /// </summary>
    private int FirstNonEmptyBin()
    {
        int i = 0;
        while (i < Values.Length && Values[i] <= 0)
        {
            i++;
        }
        return i;
    }

    /// <summary>True when both numbers are non-negative or both are negative (0 counts as non-negative).</summary>
    private static bool NumbersHasSameSign(int left, int right)
    {
        return (left >= 0 && right >= 0) || (left < 0 && right < 0);
    }
}
}

@ -0,0 +1,14 @@
namespace ZeroLevel.HNSW
{
/// <summary>
/// Rule used by <c>Histogram</c> to choose the number of bins from the number of values N it is given.
/// </summary>
public enum HistogramMode
{
/// <summary>
/// Sqrt(N), truncated to an integer
/// </summary>
SQRT,
/// <summary>
/// 1 + 3.2 * Ln(N), truncated to an integer
/// </summary>
LOG
}
}

@ -254,6 +254,11 @@ namespace ZeroLevel.HNSW
} }
} }
/// <summary>
/// Creates a <see cref="Histogram"/> from the values held in <c>_set</c>.
/// </summary>
/// <param name="mode">Rule used to choose the number of bins.</param>
/// <returns>A new histogram built from <c>_set.Values</c>.</returns>
public Histogram CalculateHistogram(HistogramMode mode)
    => new Histogram(mode, _set.Values);
internal float Distance(int id1, int id2) internal float Distance(int id1, int id2)
{ {
long k = (((long)(id1)) << HALF_LONG_BITS) + id2; long k = (((long)(id1)) << HALF_LONG_BITS) + id2;

@ -486,5 +486,7 @@ namespace ZeroLevel.HNSW
{ {
_links.Deserialize(reader); _links.Deserialize(reader);
} }
/// <summary>
/// Returns the histogram calculated by <c>_links</c> for the given binning mode.
/// </summary>
/// <param name="mode">Rule used to choose the number of bins.</param>
internal Histogram GetHistogram(HistogramMode mode)
{
    return _links.CalculateHistogram(mode);
}
} }
} }

@ -343,5 +343,8 @@ namespace ZeroLevel.HNSW
} }
} }
} }
/// <summary>
/// Returns the histogram produced by the first layer (<c>_layers[0]</c>).
/// </summary>
/// <param name="mode">Rule used to choose the number of bins; defaults to <see cref="HistogramMode.SQRT"/>.</param>
public Histogram GetHistogram(HistogramMode mode = HistogramMode.SQRT)
{
    var firstLayer = _layers[0];
    return firstLayer.GetHistogram(mode);
}
} }
} }

@ -1,6 +1,6 @@
using System; using System;
namespace ZeroLevel.HNSW.Utils namespace ZeroLevel.HNSW
{ {
public static class Metrics public static class Metrics
{ {

@ -8,7 +8,6 @@ namespace ZeroLevel.HNSW.Services
internal ProbabilityLayerNumberGenerator(int maxLayers, int M) internal ProbabilityLayerNumberGenerator(int maxLayers, int M)
{ {
_mL = maxLayers;
_probabilities = new float[maxLayers]; _probabilities = new float[maxLayers];
var m_L = 1.0f / Math.Log(M); var m_L = 1.0f / Math.Log(M);
for (int i = 0; i < maxLayers; i++) for (int i = 0; i < maxLayers; i++)

Loading…
Cancel
Save

Powered by TurnKey Linux.