MNIST test
Fix process find of cutoff for bimodal histogram
pull/1/head
unknown 3 years ago
parent adf09b08c8
commit 2d1e4f9d5b

@ -7,118 +7,10 @@ namespace HNSWDemo
{ {
static void Main(string[] args) static void Main(string[] args)
{ {
new AutoClusteringTest().Run(); new AutoClusteringMNISTTest().Run();
//new HistogramTest().Run();
Console.WriteLine("Completed"); Console.WriteLine("Completed");
Console.ReadKey(); Console.ReadKey();
} }
/*
static void TestOnMnist()
{
int imageCount, rowCount, colCount;
var buf = new byte[4];
var image = new byte[28 * 28];
var vectors = new List<float[]>();
using (var fs = new FileStream("t10k-images.idx3-ubyte", FileMode.Open, FileAccess.Read, FileShare.None))
{
// first 4 bytes is a magic number
fs.Read(buf, 0, 4);
// second 4 bytes is the number of images
fs.Read(buf, 0, 4);
imageCount = BitConverter.ToInt32(buf.Reverse().ToArray(), 0);
// third 4 bytes is the row count
fs.Read(buf, 0, 4);
rowCount = BitConverter.ToInt32(buf.Reverse().ToArray(), 0);
// fourth 4 bytes is the column count
fs.Read(buf, 0, 4);
colCount = BitConverter.ToInt32(buf.Reverse().ToArray(), 0);
for (int i = 0; i < imageCount; i++)
{
fs.Read(image, 0, image.Length);
vectors.Add(image.Select(b => (float)b).ToArray());
}
}
//var direct = new VectorsDirectCompare(vectors, Metrics.L2Euclidean);
var options = NSWOptions<float[]>.Create(8, 16, 200, 200, Metrics.L2Euclidean, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple);
SmallWorld<float[]> world;
if (File.Exists("graph.bin"))
{
using (var fs = new FileStream("graph.bin", FileMode.Open, FileAccess.Read, FileShare.None))
{
world = SmallWorld.CreateWorldFrom<float[]>(options, fs);
}
}
else
{
world = SmallWorld.CreateWorld<float[]>(options);
world.AddItems(vectors);
using (var fs = new FileStream("graph.bin", FileMode.Create, FileAccess.Write, FileShare.None))
{
world.Serialize(fs);
}
}
var clusters = AutomaticGraphClusterer.DetectClusters(world);
Console.WriteLine($"Found {clusters.Count} clusters");
for (int i = 0; i < clusters.Count; i++)
{
Console.WriteLine($"Cluster {i + 1} countains {clusters[i].Count} items");
}
}
static void FilterTest()
{
var count = 1000;
var testCount = 100;
var dimensionality = 128;
var samples = Person.GenerateRandom(dimensionality, count);
var testDict = samples.ToDictionary(s => s.Item2.Number, s => s.Item2);
var map = new HNSWMap<long>();
var world = new SmallWorld<float[]>(NSWOptions<float[]>.Create(6, 15, 200, 200, CosineDistance.ForUnits, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple));
var ids = world.AddItems(samples.Select(i => i.Item1).ToArray());
for (int bi = 0; bi < samples.Count; bi++)
{
map.Append(samples[bi].Item2.Number, ids[bi]);
}
Console.WriteLine("Start test");
int K = 200;
var vectors = RandomVectors(dimensionality, testCount);
var context = new SearchContext()
.SetActiveNodes(map
.ConvertFeaturesToIds(samples
.Where(p => p.Item2.Age > 20 && p.Item2.Age < 50 && p.Item2.Gender == Gender.Feemale)
.Select(p => p.Item2.Number)));
var hits = 0;
var miss = 0;
foreach (var v in vectors)
{
var numbers = map.ConvertIdsToFeatures(world.Search(v, K, context).Select(r => r.Item1));
foreach (var r in numbers)
{
var record = testDict[r];
if (record.Gender == Gender.Feemale && record.Age > 20 && record.Age < 50)
{
hits++;
}
else
{
miss++;
}
}
}
Console.WriteLine($"SUCCESS: {hits}");
Console.WriteLine($"ERROR: {miss}");
}
*/
} }
} }

@ -25,8 +25,8 @@ namespace HNSWDemo.Tests
var sw = new Stopwatch(); var sw = new Stopwatch();
var test = new VectorsDirectCompare(samples, CosineDistance.NonOptimized); var test = new VectorsDirectCompare(samples, Metrics.Cosine);
var world = new SmallWorld<float[]>(NSWOptions<float[]>.Create(8, 12, 100, 100, CosineDistance.NonOptimized)); var world = new SmallWorld<float[]>(NSWOptions<float[]>.Create(8, 12, 100, 100, Metrics.Cosine));
sw.Start(); sw.Start();
var ids = world.AddItems(samples.ToArray()); var ids = world.AddItems(samples.ToArray());

@ -0,0 +1,142 @@
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using ZeroLevel.HNSW;
using ZeroLevel.HNSW.Services;
using ZeroLevel.Services.FileSystem;
namespace HNSWDemo.Tests
{
public class AutoClusteringMNISTTest
: ITest
{
private static int Width = 3000;
private static int Height = 3000;
/// <summary>
/// Copies a tightly packed 8-bit image into a new buffer whose row stride is
/// rounded up to a multiple of four bytes, which is the scanline alignment
/// expected by the <see cref="Bitmap"/> constructor that takes a raw pointer.
/// </summary>
/// <param name="bytes">Source pixels, <paramref name="columns"/> bytes per row with no padding.</param>
/// <param name="rows">Number of rows in the image.</param>
/// <param name="columns">Number of pixels (bytes) per source row.</param>
/// <returns>A buffer of <c>rows * paddedStride</c> bytes; padding bytes are zero.</returns>
private static byte[] PadLines(byte[] bytes, int rows, int columns)
{
    int currentStride = columns;
    // Round up to the next multiple of 4 (e.g. 3 -> 4, 28 -> 28).
    int newStride = (columns + 3) / 4 * 4;
    byte[] newBytes = new byte[newStride * rows];
    for (int i = 0; i < rows; i++)
    {
        Buffer.BlockCopy(bytes, currentStride * i, newBytes, newStride * i, currentStride);
    }
    return newBytes;
}
/// <summary>
/// Loads the MNIST test images, builds (or reloads from "graph_mnist.bin") an HNSW
/// graph over them, draws the histogram of link distances, runs automatic cluster
/// detection and dumps every cluster's images as BMP files into a per-cluster folder.
/// </summary>
/// <exception cref="EndOfStreamException">The image file ends before all announced data is read.</exception>
/// <exception cref="InvalidDataException">The image file header announces a size other than 28x28.</exception>
public void Run()
{
    var folder = @"D:\Mnist";
    int columns = 28;
    int rows = 28;
    var buf = new byte[4];
    var image = new byte[rows * columns];
    var vectors = new List<byte[]>();
    using (var fs = new FileStream("t10k-images.idx3-ubyte", FileMode.Open, FileAccess.Read, FileShare.None))
    {
        // first 4 bytes is a magic number
        ReadFully(fs, buf);
        // second 4 bytes is the number of images
        int imageCount = ReadBigEndianInt32(fs, buf);
        // third 4 bytes is the row count
        int rowCount = ReadBigEndianInt32(fs, buf);
        // fourth 4 bytes is the column count
        int colCount = ReadBigEndianInt32(fs, buf);
        // The per-image buffer is sized for 28x28; any other header would make
        // every subsequent read misaligned, so fail loudly instead.
        if (rowCount != rows || colCount != columns)
        {
            throw new InvalidDataException($"Expected {rows}x{columns} images but the file header declares {rowCount}x{colCount}");
        }
        for (int i = 0; i < imageCount; i++)
        {
            ReadFully(fs, image);
            var v = new byte[image.Length];
            Array.Copy(image, v, image.Length);
            vectors.Add(v);
        }
    }
    var options = NSWOptions<byte[]>.Create(8, 16, 200, 200, Metrics.L2Euclidean);
    SmallWorld<byte[]> world;
    if (File.Exists("graph_mnist.bin"))
    {
        using (var fs = new FileStream("graph_mnist.bin", FileMode.Open, FileAccess.Read, FileShare.None))
        {
            world = SmallWorld.CreateWorldFrom<byte[]>(options, fs);
        }
    }
    else
    {
        world = SmallWorld.CreateWorld<byte[]>(options);
        world.AddItems(vectors);
        using (var fs = new FileStream("graph_mnist.bin", FileMode.Create, FileAccess.Write, FileShare.None))
        {
            world.Serialize(fs);
        }
    }
    var distance = new Func<int, int, float>((id1, id2) => Metrics.L2Euclidean(world.GetVector(id1), world.GetVector(id2)));
    var links = world.GetLinks().SelectMany(pair => pair.Value.Select(p => distance(pair.Key, p))).ToList();
    var histogram = new Histogram(HistogramMode.SQRT, links);
    DrawHistogram(histogram, @"D:\Mnist\histogram.jpg");
    var clusters = AutomaticGraphClusterer.DetectClusters(world);
    Console.WriteLine($"Found {clusters.Count} clusters");
    for (int i = 0; i < clusters.Count; i++)
    {
        var ouput = Path.Combine(folder, i.ToString("D3"));
        FSUtils.CleanAndTestFolder(ouput);
        foreach (var v in clusters[i])
        {
            byte[] newbytes = PadLines(world.GetVector(v), rows, columns);
            // Derive the stride from the buffer actually returned so it always
            // matches the layout PadLines produced.
            int stride = newbytes.Length / rows;
            // The Bitmap reads the pixel memory through a raw pointer, so the array
            // must stay pinned until the bitmap has been saved and disposed.
            var pin = GCHandle.Alloc(newbytes, GCHandleType.Pinned);
            try
            {
                using (var im = new Bitmap(columns, rows, stride, PixelFormat.Format8bppIndexed, pin.AddrOfPinnedObject()))
                {
                    im.Save(Path.Combine(ouput, $"{v}.bmp"));
                }
            }
            finally
            {
                pin.Free();
            }
        }
        Console.WriteLine($"Cluster {i + 1} countains {clusters[i].Count} items");
    }
}

// Fills the whole buffer from the stream; Stream.Read may legally return fewer bytes than requested.
private static void ReadFully(Stream stream, byte[] buffer)
{
    int offset = 0;
    while (offset < buffer.Length)
    {
        int read = stream.Read(buffer, offset, buffer.Length - offset);
        if (read == 0)
        {
            throw new EndOfStreamException("Unexpected end of the MNIST image file");
        }
        offset += read;
    }
}

// Reads a 4-byte big-endian integer (the IDX header encoding) independent of host endianness.
private static int ReadBigEndianInt32(Stream stream, byte[] scratch)
{
    ReadFully(stream, scratch);
    return (scratch[0] << 24) | (scratch[1] << 16) | (scratch[2] << 8) | scratch[3];
}
/// <summary>
/// Renders the histogram as a bar chart and saves it to <paramref name="filename"/>.
/// Bins reported by <c>GetMaximums()</c> are outlined twice in red, all other bins once
/// in blue, and a green vertical line marks the bin returned by <c>CuttOff()</c>.
/// </summary>
static void DrawHistogram(Histogram histogram, string filename)
{
    var barWidth = Width / histogram.Values.Length;
    var scale = ((float)Height) / (float)histogram.Values.Max();
    var peaks = histogram.GetMaximums().ToDictionary(m => m.Index, m => m);
    int cutoffBin = histogram.CuttOff();
    using (var canvas = new Bitmap(Width, Height))
    {
        using (var gfx = Graphics.FromImage(canvas))
        {
            for (int bin = 0; bin < histogram.Values.Length; bin++)
            {
                var barHeight = (int)(histogram.Values[bin] * scale);
                var left = bin * barWidth;
                var top = canvas.Height - barHeight;
                var isPeak = peaks.ContainsKey(bin);

                gfx.DrawRectangle(isPeak ? Pens.Red : Pens.Blue, left, top, barWidth, barHeight);
                if (isPeak)
                {
                    // Second, inset outline makes the peak bars look thicker.
                    gfx.DrawRectangle(Pens.Red, left + 1, top, barWidth - 1, barHeight);
                }

                if (bin == cutoffBin)
                {
                    var middle = left + barWidth / 2;
                    gfx.DrawLine(Pens.Green, middle, 0, middle, canvas.Height);
                }
            }
        }
        canvas.Save(filename);
    }
}
}
}

@ -0,0 +1,57 @@
using HNSWDemo.Model;
using System;
using System.Linq;
using ZeroLevel.HNSW;
namespace HNSWDemo.Tests
{
public class FilterTest
: ITest
{
private const int count = 3000;
private const int testCount = 100;
private const int dimensionality = 128;
/// <summary>
/// Builds a graph over random persons, restricts the search context to the ids of
/// females aged 21..49, runs a batch of random queries and prints how many returned
/// records satisfy the filter (SUCCESS) and how many do not (ERROR).
/// </summary>
public void Run()
{
    var map = new HNSWMap<long>();
    var samples = Person.GenerateRandom(dimensionality, count);
    var testDict = samples.ToDictionary(s => s.Item2.Number, s => s.Item2);
    var graphOptions = NSWOptions<float[]>.Create(6, 15, 200, 200, CosineDistance.ForUnits);
    var world = new SmallWorld<float[]>(graphOptions);
    var ids = world.AddItems(samples.Select(i => i.Item1).ToArray());
    for (int index = 0; index < samples.Count; index++)
    {
        map.Append(samples[index].Item2.Number, ids[index]);
    }

    Console.WriteLine("Start test");

    int neighbourCount = 200;
    var queries = VectorUtils.RandomVectors(dimensionality, testCount);
    var allowedNumbers = samples
        .Where(p => p.Item2.Age > 20 && p.Item2.Age < 50 && p.Item2.Gender == Gender.Feemale)
        .Select(p => p.Item2.Number);
    var context = new SearchContext()
        .SetActiveNodes(map.ConvertFeaturesToIds(allowedNumbers));

    var matched = 0;
    var mismatched = 0;
    foreach (var query in queries)
    {
        var foundIds = world.Search(query, neighbourCount, context).Select(r => r.Item1);
        foreach (var number in map.ConvertIdsToFeatures(foundIds))
        {
            var record = testDict[number];
            var passesFilter = record.Gender == Gender.Feemale && record.Age > 20 && record.Age < 50;
            // With no node check configured on the context every result counts as a hit.
            if (context.NodeCheckMode == Mode.None || passesFilter)
            {
                matched++;
            }
            else
            {
                mismatched++;
            }
        }
    }

    Console.WriteLine($"SUCCESS: {matched}");
    Console.WriteLine($"ERROR: {mismatched}");
}
}
}

@ -24,7 +24,7 @@ namespace HNSWDemo.Tests
var histogram = new Histogram(HistogramMode.SQRT, weights); var histogram = new Histogram(HistogramMode.SQRT, weights);
histogram.Smooth(); histogram.Smooth();
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;
@ -38,7 +38,7 @@ namespace HNSWDemo.Tests
var k = ((float)Height) / (float)histogram.Values.Max(); var k = ((float)Height) / (float)histogram.Values.Max();
var maxes = histogram.GetMaximums().ToDictionary(m => m.Index, m => m); var maxes = histogram.GetMaximums().ToDictionary(m => m.Index, m => m);
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
using (var bmp = new Bitmap(Width, Height)) using (var bmp = new Bitmap(Width, Height))
{ {

@ -14,7 +14,7 @@ namespace HNSWDemo.Tests
public void Run() public void Run()
{ {
var sw = new Stopwatch(); var sw = new Stopwatch();
var world = new SmallWorld<float[]>(NSWOptions<float[]>.Create(6, 12, 100, 100, CosineDistance.NonOptimized)); var world = new SmallWorld<float[]>(NSWOptions<float[]>.Create(6, 12, 100, 100, Metrics.Cosine));
for (int i = 0; i < IterationCount; i++) for (int i = 0; i < IterationCount; i++)
{ {
var samples = VectorUtils.RandomVectors(Dimensionality, Count); var samples = VectorUtils.RandomVectors(Dimensionality, Count);

@ -26,11 +26,11 @@ namespace HNSWDemo.Tests
{ {
var v1 = samples[i]; var v1 = samples[i];
var v2 = samples[i + 1]; var v2 = samples[i + 1];
var dist = CosineDistance.NonOptimized(v1, v2); var dist = Metrics.Cosine(v1, v2);
var qv1 = q_samples[i]; var qv1 = q_samples[i];
var qv2 = q_samples[i + 1]; var qv2 = q_samples[i + 1];
var qdist = CosineDistance.NonOptimized(qv1, qv2); var qdist = Metrics.Cosine(qv1, qv2);
list.Add(Math.Abs(dist - qdist)); list.Add(Math.Abs(dist - qdist));
} }

@ -28,8 +28,8 @@ namespace HNSWDemo.Tests
var sw = new Stopwatch(); var sw = new Stopwatch();
var test = new VectorsDirectCompare(s, CosineDistance.NonOptimized); var test = new VectorsDirectCompare(s, Metrics.Cosine);
var world = new SmallWorld<long[]>(NSWOptions<long[]>.Create(6, 8, 100, 100, CosineDistance.NonOptimized)); var world = new SmallWorld<long[]>(NSWOptions<long[]>.Create(6, 8, 100, 100, Metrics.Cosine));
sw.Start(); sw.Start();
var ids = world.AddItems(samples.ToArray()); var ids = world.AddItems(samples.ToArray());

@ -18,15 +18,15 @@ namespace HNSWDemo.Tests
{ {
var vectors = VectorUtils.RandomVectors(Dimensionality, Count); var vectors = VectorUtils.RandomVectors(Dimensionality, Count);
var q = new Quantizator(-1f, 1f); var q = new Quantizator(-1f, 1f);
var world = SmallWorld.CreateWorld<long[]>(NSWOptions<long[]>.Create(8, 16, 200, 200, CosineDistance.NonOptimized)); var world = SmallWorld.CreateWorld<long[]>(NSWOptions<long[]>.Create(8, 16, 200, 200, Metrics.Cosine));
world.AddItems(vectors.Select(v => q.QuantizeToLong(v)).ToList()); world.AddItems(vectors.Select(v => q.QuantizeToLong(v)).ToList());
var distance = new Func<int, int, float>((id1, id2) => CosineDistance.NonOptimized(world.GetVector(id1), world.GetVector(id2))); var distance = new Func<int, int, float>((id1, id2) => Metrics.Cosine(world.GetVector(id1), world.GetVector(id2)));
var weights = world.GetLinks().SelectMany(pair => pair.Value.Select(id => distance(pair.Key, id))); var weights = world.GetLinks().SelectMany(pair => pair.Value.Select(id => distance(pair.Key, id)));
var histogram = new Histogram(HistogramMode.SQRT, weights); var histogram = new Histogram(HistogramMode.SQRT, weights);
histogram.Smooth(); histogram.Smooth();
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;
@ -40,7 +40,7 @@ namespace HNSWDemo.Tests
var k = ((float)Height) / (float)histogram.Values.Max(); var k = ((float)Height) / (float)histogram.Values.Max();
var maxes = histogram.GetMaximums().ToDictionary(m => m.Index, m => m); var maxes = histogram.GetMaximums().ToDictionary(m => m.Index, m => m);
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
using (var bmp = new Bitmap(Width, Height)) using (var bmp = new Bitmap(Width, Height))
{ {

@ -16,7 +16,7 @@ namespace HNSWDemo.Tests
public void Run() public void Run()
{ {
var sw = new Stopwatch(); var sw = new Stopwatch();
var world = new SmallWorld<long[]>(NSWOptions<long[]>.Create(6, 12, 100, 100, CosineDistance.NonOptimized)); var world = new SmallWorld<long[]>(NSWOptions<long[]>.Create(6, 12, 100, 100, Metrics.Cosine));
var q = new Quantizator(-1f, 1f); var q = new Quantizator(-1f, 1f);
for (int i = 0; i < IterationCount; i++) for (int i = 0; i < IterationCount; i++)
{ {

@ -42,7 +42,7 @@ namespace HNSWDemo.Utils
// 1. Find R - bound between intra-cluster distances and out-of-cluster distances // 1. Find R - bound between intra-cluster distances and out-of-cluster distances
var histogram = new Histogram(HistogramMode.SQRT, links.Values); var histogram = new Histogram(HistogramMode.SQRT, links.Values);
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;

@ -42,7 +42,7 @@ namespace HNSWDemo.Utils
// 1. Find R - bound between intra-cluster distances and out-of-cluster distances // 1. Find R - bound between intra-cluster distances and out-of-cluster distances
var histogram = new Histogram(HistogramMode.SQRT, links.Values); var histogram = new Histogram(HistogramMode.SQRT, links.Values);
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;

@ -42,7 +42,7 @@ namespace HNSWDemo.Utils
// 1. Find R - bound between intra-cluster distances and out-of-cluster distances // 1. Find R - bound between intra-cluster distances and out-of-cluster distances
var histogram = new Histogram(HistogramMode.SQRT, links.Values); var histogram = new Histogram(HistogramMode.SQRT, links.Values);
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;

@ -150,7 +150,7 @@ namespace ZeroLevel.HNSW
return (float)sum; return (float)sum;
} }
/*
public int OTSU() public int OTSU()
{ {
float p1, p2, p12; float p1, p2, p12;
@ -172,15 +172,108 @@ namespace ZeroLevel.HNSW
threshold = k; threshold = k;
} }
} }
/* return threshold;
var local_max = Values[threshold]; }
for (int i = threshold + 1; i < Values.Length; i++) */
{ /*
1. Gradient: V[i] - V[i-1]
2. Sweep with windows of size 1 and up until the histogram converges to two modes
3. Find the cutoff as the minimum between the two peaks
Modes = 0
W = 1
D = [V.count1]
Maxes = []
For I in [1..V.count]
D= V[I] - V[i-1]
do
Modes = 0
S = +1
do
for wnd in D
if wnd.sum > 0 & S < 0
S = +1
Elif wnd.sum < 0 & S > 0
Maxes.push(wnd.maxindex)
Modes ++
S = -1
W++
while Modes > 2
If Modes == 2
Cutoff = Maxes[0]
Min = V[I]
For I=Maxes[0] to Maxes[1]
if V[I] < Min
Min = V[I]
Cutoff = i
*/
/// <summary>
/// Finds the bin that separates the two modes of a bimodal histogram.
/// The bin-to-bin gradient is scanned in windows of growing width until at most
/// two peaks (a rising-to-falling change of the windowed gradient) remain; the
/// cutoff is the lowest bin between those two peaks.
/// </summary>
/// <returns>
/// Index of the cutoff bin, or -1 when the histogram has fewer than three bins or
/// does not settle on exactly two peaks. Callers must check for -1 before using the
/// result as an index.
/// </returns>
public int CuttOff()
{
    // With fewer than three bins no gradient entry is ever filled, so there is
    // nothing to analyse (and the array accesses below would be out of range).
    if (Values.Length < 3)
    {
        return -1;
    }

    // grad[i] = Values[i + 1] - Values[i] for i in 1..Length-2; first and last entries stay 0.
    var grad = new int[Values.Length];
    for (int k = 2; k < Values.Length; k++)
    {
        grad[k - 1] = Values[k] - Values[k - 1];
    }

    var maxes = new List<int>();
    int modes;
    int window = 0;
    do
    {
        maxes.Clear();
        window++;
        modes = 0;
        // Every pass starts in the "rising" state, as in the algorithm notes above;
        // carrying the state over from the previous pass would make the result
        // depend on how that pass happened to end.
        int sign = 1;
        for (int i = 0; i < grad.Length; i += window)
        {
            int sum = grad[i];
            int max = Values[i];
            int maxInd = i;
            for (int w = 1; w < window && (i + w) < grad.Length; w++)
            {
                sum += grad[i + w];
                if (Values[i + w] > max)
                {
                    max = Values[i + w];
                    maxInd = i + w;
                }
            }
            if (sum > 0 && sign < 0)
            {
                sign = 1;
            }
            else if (sum < 0 && sign > 0)
            {
                // Rising turned into falling: the tallest bin of this window is a peak.
                modes++;
                maxes.Add(maxInd);
                sign = -1;
            }
        }
    } while (modes > 2);

    if (modes != 2)
    {
        return -1;
    }

    // The cutoff is the lowest bin strictly between the two peaks
    // (the first peak itself when nothing lower lies in between).
    int cutoff = maxes[0];
    int min = Values[cutoff];
    for (int i = maxes[0] + 1; i < maxes[1]; i++)
    {
        if (Values[i] < min)
        {
            cutoff = i;
            min = Values[i];
        }
    }
    return cutoff;
}
#endregion #endregion
static bool NumbersHasSameSign(int left, int right) static bool NumbersHasSameSign(int left, int right)

@ -5,20 +5,22 @@ using System.Runtime.CompilerServices;
namespace ZeroLevel.HNSW namespace ZeroLevel.HNSW
{ {
public sealed class SearchContext public enum Mode
{ {
enum Mode None,
{ ActiveCheck,
None, InactiveCheck,
ActiveCheck, ActiveInactiveCheck
InactiveCheck, }
ActiveInactiveCheck
}
public sealed class SearchContext
{
private HashSet<int> _activeNodes; private HashSet<int> _activeNodes;
private HashSet<int> _entryNodes; private HashSet<int> _entryNodes;
private Mode _mode; private Mode _mode;
public Mode NodeCheckMode => _mode;
public SearchContext() public SearchContext()
{ {
_mode = Mode.None; _mode = Mode.None;

@ -22,7 +22,7 @@ namespace ZeroLevel.HNSW.Services
// 1. Find R - bound between intra-cluster distances and out-of-cluster distances // 1. Find R - bound between intra-cluster distances and out-of-cluster distances
var histogram = new Histogram(HistogramMode.SQRT, links.Select(l => l.Distance)); var histogram = new Histogram(HistogramMode.SQRT, links.Select(l => l.Distance));
int threshold = histogram.OTSU(); int threshold = histogram.CuttOff();
var min = histogram.Bounds[threshold - 1]; var min = histogram.Bounds[threshold - 1];
var max = histogram.Bounds[threshold]; var max = histogram.Bounds[threshold];
var R = (max + min) / 2; var R = (max + min) / 2;

@ -256,14 +256,80 @@ namespace ZeroLevel.HNSW
return W; return W;
} }
/// <summary>
/// Searches the layer starting from <paramref name="entryPointId"/> and collects up to
/// <paramref name="ef"/> nodes with the smallest cost. Every reachable node may be used
/// as a candidate for further expansion, but only nodes accepted by
/// <c>context.IsActiveNode</c> are placed into the result heap.
/// </summary>
/// <param name="entryPointId">Id of the node the search starts from.</param>
/// <param name="targetCosts">Returns the cost of a node id (the distance to the query q in the inline pseudo-code).</param>
/// <param name="ef">Maximum number of results kept in the result heap.</param>
/// <param name="context">Filter deciding which nodes may appear in the result.</param>
/// <returns>The result heap W, exposed as a sequence of (id, cost) pairs.</returns>
internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func<int, float> targetCosts, int ef, SearchContext context)
{
int farthestId;
float farthestDistance;
var d = targetCosts(entryPointId);
var v = new VisitedBitSet(_vectors.Count, _options.M);
// * v ← ep // set of visited elements
v.Add(entryPointId);
// * C ← ep // set of candidates
var C = new MinHeap(ef);
C.Push((entryPointId, d));
// * W ← ep // dynamic list of found nearest neighbors
var W = new MaxHeap(ef + 1);
// The entry point is always expanded (it is in C) but is reported only if the filter accepts it.
if (context.IsActiveNode(entryPointId))
{
W.Push((entryPointId, d));
}
// * while │C│ > 0
while (C.Count > 0)
{
// * c ← extract nearest element from C to q
var c = C.Pop();
// * f ← get furthest element from W to q
// * if distance(c, q) > distance(f, q)
// While W is still empty TryPeek fails, so the search keeps expanding candidates.
if (W.TryPeek(out _, out farthestDistance) && c.Item2 > farthestDistance)
{
// * break // all elements in W are evaluated
break;
}
// * for each e ∈ neighbourhood(c) at layer lc // update C and W
foreach (var e in GetNeighbors(c.Item1))
{
// * if e ∉ v
if (!v.Contains(e))
{
// * v ← v ⋃ e
v.Add(e);
// * f ← get furthest element from W to q
W.TryPeek(out farthestId, out farthestDistance);
var eDistance = targetCosts(e);
// * if distance(e, q) < distance(f, q) or │W│ < ef
// NOTE(review): the farthestId >= 0 test presumably relies on TryPeek returning a negative id for an empty heap — confirm against MaxHeap.
if (W.Count < ef || (farthestId >= 0 && eDistance < farthestDistance))
{
// * C ← C ⋃ e
C.Push((e, eDistance));
// * W ← W ⋃ e (only when the filter accepts e)
if (context.IsActiveNode(e))
{
W.Push((e, eDistance));
// Keep at most ef results by dropping the current farthest one.
if (W.Count > ef)
{
W.Pop();
}
}
}
}
}
}
C.Clear();
v.Clear();
return W;
}
/// <summary> /// <summary>
/// Algorithm 2 /// Algorithm 2
/// </summary> /// </summary>
/// <param name="q">query element</param> /// <param name="q">query element</param>
/// <param name="ep">enter points ep</param> /// <param name="ep">enter points ep</param>
/// <returns>Output: ef closest neighbors to q</returns> /// <returns>Output: ef closest neighbors to q</returns>
/* internal IEnumerable<(int, float)> KNearestAвtLayer(int entryPointId, Func<int, float> targetCosts, int ef, SearchContext context)
internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func<int, float> targetCosts, int ef, SearchContext context)
{ {
int farthestId; int farthestId;
float farthestDistance; float farthestDistance;
@ -326,7 +392,6 @@ namespace ZeroLevel.HNSW
v.Clear(); v.Clear();
return W; return W;
} }
*/
/// <summary> /// <summary>
/// Algorithm 2, modified for LookAlike /// Algorithm 2, modified for LookAlike

@ -75,7 +75,8 @@ namespace ZeroLevel.HNSW
public void Serialize(IBinaryWriter writer) public void Serialize(IBinaryWriter writer)
{ {
writer.WriteBoolean(false); // true - set with weights writer.WriteBoolean(false); // true - set with weights
writer.WriteInt32(_set.Sum(pair => pair.Value.Count)); var count = _set.Sum(pair => pair.Value.Count);
writer.WriteInt32(count);
foreach (var record in _set) foreach (var record in _set)
{ {
var id = record.Key; var id = record.Key;
@ -89,9 +90,9 @@ namespace ZeroLevel.HNSW
public void Deserialize(IBinaryReader reader) public void Deserialize(IBinaryReader reader)
{ {
if (reader.ReadBoolean() == false) if (reader.ReadBoolean() != false)
{ {
throw new InvalidOperationException("Incompatible data format. The set does not contain weights."); throw new InvalidOperationException("Incompatible format");
} }
_set.Clear(); _set.Clear();
_set = null; _set = null;

@ -40,6 +40,8 @@ namespace ZeroLevel.HNSW
public SmallWorld(NSWOptions<TItem> options, Stream stream) public SmallWorld(NSWOptions<TItem> options, Stream stream)
{ {
_options = options; _options = options;
_layerLevelGenerator = new ProbabilityLayerNumberGenerator(_options.LayersCount, _options.M);
DistanceFunction = new Func<int, int, float>((id1, id2) => _options.Distance(_vectors[id1], _vectors[id2]));
Deserialize(stream); Deserialize(stream);
} }
@ -57,7 +59,7 @@ namespace ZeroLevel.HNSW
yield return (pair.Item1, _vectors[pair.Item1], pair.Item2); yield return (pair.Item1, _vectors[pair.Item1], pair.Item2);
} }
} }
/*
public IEnumerable<(int, TItem, float)> Search(TItem vector, int k, SearchContext context) public IEnumerable<(int, TItem, float)> Search(TItem vector, int k, SearchContext context)
{ {
if (context == null) if (context == null)
@ -76,6 +78,7 @@ namespace ZeroLevel.HNSW
} }
} }
/*
public IEnumerable<(int, TItem, float)> Search(int k, SearchContext context) public IEnumerable<(int, TItem, float)> Search(int k, SearchContext context)
{ {
if (context == null) if (context == null)
@ -261,7 +264,7 @@ namespace ZeroLevel.HNSW
_lockGraph.ExitReadLock(); _lockGraph.ExitReadLock();
} }
} }
/*
private IEnumerable<(int, float)> KNearest(TItem q, int k, SearchContext context) private IEnumerable<(int, float)> KNearest(TItem q, int k, SearchContext context)
{ {
_lockGraph.EnterReadLock(); _lockGraph.EnterReadLock();
@ -271,6 +274,7 @@ namespace ZeroLevel.HNSW
{ {
return Enumerable.Empty<(int, float)>(); return Enumerable.Empty<(int, float)>();
} }
int id; int id;
float value; float value;
var distance = new Func<int, float>(candidate => _options.Distance(q, _vectors[candidate])); var distance = new Func<int, float>(candidate => _options.Distance(q, _vectors[candidate]));
@ -309,7 +313,7 @@ namespace ZeroLevel.HNSW
_lockGraph.ExitReadLock(); _lockGraph.ExitReadLock();
} }
} }
*/
/* /*
private IEnumerable<(int, float)> KNearest(int k, SearchContext context) private IEnumerable<(int, float)> KNearest(int k, SearchContext context)

@ -27,141 +27,7 @@ namespace ZeroLevel.HNSW
/// <param name="u">Left vector.</param> /// <param name="u">Left vector.</param>
/// <param name="v">Right vector.</param> /// <param name="v">Right vector.</param>
/// <returns>Cosine distance between u and v.</returns> /// <returns>Cosine distance between u and v.</returns>
public static float NonOptimized(float[] u, float[] v)
{
    // Cosine distance = 1 - (u . v) / (|u| * |v|); both vectors must have the same length.
    int dimension = u.Length;
    if (dimension != v.Length)
    {
        throw new ArgumentException("Vectors have non-matching dimensions");
    }

    float dotProduct = 0.0f;
    float squaredNormU = 0.0f;
    float squaredNormV = 0.0f;
    for (int index = 0; index < dimension; index++)
    {
        float a = u[index];
        float b = v[index];
        dotProduct += a * b;
        squaredNormU += a * a;
        squaredNormV += b * b;
    }

    float normProduct = (float)(Math.Sqrt(squaredNormU) * Math.Sqrt(squaredNormV));
    return 1 - dotProduct / normProduct;
}
/// <summary>
/// Cosine distance between two byte vectors: 1 - (u . v) / (|u| * |v|).
/// </summary>
/// <param name="u">Left vector.</param>
/// <param name="v">Right vector.</param>
/// <returns>Cosine distance between u and v.</returns>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
public static float NonOptimized(byte[] u, byte[] v)
{
    int dimension = u.Length;
    if (dimension != v.Length)
    {
        throw new ArgumentException("Vectors have non-matching dimensions");
    }

    float dotProduct = 0.0f;
    float squaredNormU = 0.0f;
    float squaredNormV = 0.0f;
    for (int index = 0; index < dimension; index++)
    {
        // Products are formed in int and only then converted to float.
        int a = u[index];
        int b = v[index];
        dotProduct += (float)(a * b);
        squaredNormU += (float)(a * a);
        squaredNormV += (float)(b * b);
    }

    float normProduct = (float)(Math.Sqrt(squaredNormU) * Math.Sqrt(squaredNormV));
    return 1 - dotProduct / normProduct;
}
/// <summary>
/// Cosine distance between two int vectors where every int is treated as four
/// independent byte components (the bytes produced by <see cref="BitConverter.GetBytes(int)"/>).
/// </summary>
/// <param name="u">Left vector.</param>
/// <param name="v">Right vector.</param>
/// <returns>Cosine distance between the byte-expanded vectors.</returns>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
public static float NonOptimized(int[] u, int[] v)
{
    if (u.Length != v.Length)
    {
        throw new ArgumentException("Vectors have non-matching dimensions");
    }

    float dotProduct = 0.0f;
    float squaredNormU = 0.0f;
    float squaredNormV = 0.0f;
    for (int index = 0; index < u.Length; ++index)
    {
        byte[] left = BitConverter.GetBytes(u[index]);
        byte[] right = BitConverter.GetBytes(v[index]);
        for (int part = 0; part < sizeof(int); part++)
        {
            dotProduct += (float)(left[part] * right[part]);
            squaredNormU += (float)(left[part] * left[part]);
            squaredNormV += (float)(right[part] * right[part]);
        }
    }

    float normProduct = (float)(Math.Sqrt(squaredNormU) * Math.Sqrt(squaredNormV));
    return 1 - dotProduct / normProduct;
}
/// <summary>
/// Cosine distance between two long vectors where every long is treated as eight
/// independent byte components (the bytes produced by <see cref="BitConverter.GetBytes(long)"/>).
/// </summary>
/// <param name="u">Left vector.</param>
/// <param name="v">Right vector.</param>
/// <returns>Cosine distance between the byte-expanded vectors.</returns>
/// <exception cref="ArgumentException">The vectors differ in length.</exception>
public static float NonOptimized(long[] u, long[] v)
{
    if (u.Length != v.Length)
    {
        throw new ArgumentException("Vectors have non-matching dimensions");
    }

    float dotProduct = 0.0f;
    float squaredNormU = 0.0f;
    float squaredNormV = 0.0f;
    for (int index = 0; index < u.Length; ++index)
    {
        byte[] left = BitConverter.GetBytes(u[index]);
        byte[] right = BitConverter.GetBytes(v[index]);
        for (int part = 0; part < sizeof(long); part++)
        {
            dotProduct += (float)(left[part] * right[part]);
            squaredNormU += (float)(left[part] * left[part]);
            squaredNormV += (float)(right[part] * right[part]);
        }
    }

    float normProduct = (float)(Math.Sqrt(squaredNormU) * Math.Sqrt(squaredNormV));
    return 1 - dotProduct / normProduct;
}
/// <summary> /// <summary>
/// Calculates cosine distance with assumption that u and v are unit vectors. /// Calculates cosine distance with assumption that u and v are unit vectors.

@ -1,83 +0,0 @@
using System;
namespace ZeroLevel.HNSW
{
/// <summary>
/// Distance functions over float vectors. All functions iterate over the length
/// of the first vector; the second vector must be at least as long.
/// </summary>
public static class Metrics
{
    /// <summary>
    /// The taxicab metric is also known as rectilinear distance,
    /// L1 distance or L1 norm, city block distance, Manhattan distance,
    /// or Manhattan length, with the corresponding variations in the name of the geometry.
    /// It represents the distance between points in a city road grid.
    /// It examines the absolute differences between the coordinates of a pair of objects.
    /// </summary>
    /// <returns>Sum of the absolute coordinate differences.</returns>
    public static float L1Manhattan(float[] v1, float[] v2)
    {
        float res = 0;
        for (int i = 0; i < v1.Length; i++)
        {
            // L1 sums absolute differences; squaring here would yield the squared L2 distance.
            res += Math.Abs(v1[i] - v2[i]);
        }
        return res;
    }

    /// <summary>
    /// Euclidean distance is the most common use of distance.
    /// Euclidean distance, or simply 'distance',
    /// examines the root of square differences between the coordinates of a pair of objects.
    /// This is most generally known as the Pythagorean theorem.
    /// </summary>
    /// <returns>Square root of the sum of squared coordinate differences.</returns>
    public static float L2Euclidean(float[] v1, float[] v2)
    {
        float res = 0;
        for (int i = 0; i < v1.Length; i++)
        {
            float t = v1[i] - v2[i];
            res += t * t;
        }
        return (float)Math.Sqrt(res);
    }

    /// <summary>
    /// The general metric for distance is the Minkowski distance.
    /// When lambda is equal to 1, it becomes the city block distance (L1),
    /// and when lambda is equal to 2, it becomes the Euclidean distance (L2).
    /// The special case is when lambda is equal to infinity (taking a limit),
    /// where it is considered as the Chebyshev distance.
    /// </summary>
    /// <param name="v1">First vector.</param>
    /// <param name="v2">Second vector.</param>
    /// <param name="order">The exponent lambda; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static float MinkowskiDistance(float[] v1, float[] v2, int order)
    {
        if (order < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(order), "Minkowski order must be at least 1");
        }
        int count = v1.Length;
        double sum = 0.0;
        for (int i = 0; i < count; i++)
        {
            sum = sum + Math.Pow(Math.Abs(v1[i] - v2[i]), order);
        }
        // 1.0 / order: the integer expression 1 / order is 0 for every order above 1,
        // which would turn the result into sum^0 = 1.
        return (float)Math.Pow(sum, 1.0 / order);
    }

    /// <summary>
    /// Chebyshev distance is also called the Maximum value distance,
    /// defined on a vector space where the distance between two vectors is
    /// the greatest of their differences along any coordinate dimension.
    /// In other words, it examines the absolute magnitude of the differences
    /// between the coordinates of a pair of objects.
    /// </summary>
    /// <returns>The largest absolute coordinate difference; 0 for empty vectors.</returns>
    public static double ChebyshevDistance(float[] v1, float[] v2)
    {
        int count = v1.Length;
        // Absolute differences are never negative, so 0 is the natural starting maximum
        // and also the sensible answer for empty input.
        float max = 0f;
        float c;
        for (int i = 0; i < count; i++)
        {
            c = Math.Abs(v1[i] - v2[i]);
            if (c > max)
            {
                max = c;
            }
        }
        return max;
    }
}
}

@ -0,0 +1,367 @@
using System;
using System.Linq;
namespace ZeroLevel.HNSW
{
public static class Metrics
{
/// <summary>
/// The taxicab metric is also known as rectilinear distance,
/// L1 distance or L1 norm, city block distance, Manhattan distance,
/// or Manhattan length, with the corresponding variations in the name of the geometry.
/// It represents the distance between points in a city road grid.
/// It examines the absolute differences between the coordinates of a pair of objects.
/// </summary>
/// <param name="v1">First vector; its length drives the iteration.</param>
/// <param name="v2">Second vector; must be at least as long as <paramref name="v1"/>.</param>
/// <returns>Sum of the absolute coordinate differences.</returns>
public static float L1Manhattan(float[] v1, float[] v2)
{
    float res = 0;
    for (int i = 0; i < v1.Length; i++)
    {
        // L1 sums absolute differences; squaring here would yield the squared L2 distance.
        res += Math.Abs(v1[i] - v2[i]);
    }
    return res;
}
/// <summary>
/// Manhattan (L1) distance between two byte vectors: the sum of the absolute
/// coordinate differences.
/// </summary>
/// <param name="v1">First vector; its length drives the iteration.</param>
/// <param name="v2">Second vector; must be at least as long as <paramref name="v1"/>.</param>
public static float L1Manhattan(byte[] v1, byte[] v2)
{
    float res = 0;
    for (int i = 0; i < v1.Length; i++)
    {
        // Bytes are promoted to int before subtraction, so the difference cannot wrap.
        res += Math.Abs(v1[i] - v2[i]);
    }
    return res;
}
/// <summary>
/// Manhattan (L1) distance between two int vectors: the sum of the absolute
/// coordinate differences.
/// </summary>
/// <param name="v1">First vector; its length drives the iteration.</param>
/// <param name="v2">Second vector; must be at least as long as <paramref name="v1"/>.</param>
public static float L1Manhattan(int[] v1, int[] v2)
{
    float res = 0;
    for (int i = 0; i < v1.Length; i++)
    {
        // Subtract in 64 bits: the difference of two ints does not always fit in an int.
        long diff = (long)v1[i] - v2[i];
        res += Math.Abs(diff);
    }
    return res;
}
/// <summary>
/// Manhattan (L1) distance between two long vectors: the sum of the absolute
/// coordinate differences.
/// </summary>
/// <param name="v1">First vector; its length drives the iteration.</param>
/// <param name="v2">Second vector; must be at least as long as <paramref name="v1"/>.</param>
public static float L1Manhattan(long[] v1, long[] v2)
{
    float res = 0;
    for (int i = 0; i < v1.Length; i++)
    {
        long a = v1[i];
        long b = v2[i];
        // The absolute difference of two longs always fits in a ulong, whereas a signed
        // subtraction can wrap; modular arithmetic on ulong gives the exact magnitude.
        ulong magnitude = unchecked(a >= b ? (ulong)a - (ulong)b : (ulong)b - (ulong)a);
        res += magnitude;
    }
    return res;
}
/// <summary>
/// Euclidean (L2) distance: the square root of the sum of the squared
/// differences between matching coordinates of the two vectors.
/// </summary>
/// <param name="v1">First vector; its length drives the iteration.</param>
/// <param name="v2">Second vector; must be at least as long as <paramref name="v1"/>.</param>
public static float L2Euclidean(float[] v1, float[] v2)
{
    float sumOfSquares = 0;
    int index = 0;
    while (index < v1.Length)
    {
        float delta = v1[index] - v2[index];
        sumOfSquares += delta * delta;
        index++;
    }
    return (float)Math.Sqrt(sumOfSquares);
}
public static float L2Euclidean(byte[] v1, byte[] v2)
{
float res = 0;
for (int i = 0; i < v1.Length; i++)
{
float t = v1[i] - v2[i];
res += t * t;
}
return (float)Math.Sqrt(res);
}
public static float L2Euclidean(int[] v1, int[] v2)
{
float res = 0;
for (int i = 0; i < v1.Length; i++)
{
float t = v1[i] - v2[i];
res += t * t;
}
return (float)Math.Sqrt(res);
}
public static float L2Euclidean(long[] v1, long[] v2)
{
float res = 0;
for (int i = 0; i < v1.Length; i++)
{
float t = v1[i] - v2[i];
res += t * t;
}
return (float)Math.Sqrt(res);
}
/// <summary>
/// The general metric for distance is the Minkowski distance.
/// When lambda is equal to 1, it becomes the city block distance (L1),
/// and when lambda is equal to 2, it becomes the Euclidean distance (L2).
/// The special case is when lambda is equal to infinity (taking a limit),
/// where it is considered as the Chebyshev distance.
/// </summary>
public static float MinkowskiDistance(float[] v1, float[] v2, int order)
{
int count = v1.Length;
double sum = 0.0;
for (int i = 0; i < count; i++)
{
sum = sum + Math.Pow(Math.Abs(v1[i] - v2[i]), order);
}
return (float)Math.Pow(sum, (1 / order));
}
public static float MinkowskiDistance(byte[] v1, byte[] v2, int order)
{
int count = v1.Length;
double sum = 0.0;
for (int i = 0; i < count; i++)
{
sum = sum + Math.Pow(Math.Abs(v1[i] - v2[i]), order);
}
return (float)Math.Pow(sum, (1 / order));
}
public static float MinkowskiDistance(int[] v1, int[] v2, int order)
{
int count = v1.Length;
double sum = 0.0;
for (int i = 0; i < count; i++)
{
sum = sum + Math.Pow(Math.Abs(v1[i] - v2[i]), order);
}
return (float)Math.Pow(sum, (1 / order));
}
public static float MinkowskiDistance(long[] v1, long[] v2, int order)
{
int count = v1.Length;
double sum = 0.0;
for (int i = 0; i < count; i++)
{
sum = sum + Math.Pow(Math.Abs(v1[i] - v2[i]), order);
}
return (float)Math.Pow(sum, (1 / order));
}
/// <summary>
/// Chebyshev distance is also called the Maximum value distance,
/// defined on a vector space where the distance between two vectors is
/// the greatest of their differences along any coordinate dimension.
/// In other words, it examines the absolute magnitude of the differences
/// between the coordinates of a pair of objects.
/// </summary>
public static double ChebyshevDistance(float[] v1, float[] v2)
{
int count = v1.Length;
float max = float.MinValue;
float c;
for (int i = 0; i < count; i++)
{
c = Math.Abs(v1[i] - v2[i]);
if (c > max)
{
max = c;
}
}
return max;
}
public static double ChebyshevDistance(byte[] v1, byte[] v2)
{
int count = v1.Length;
float max = float.MinValue;
float c;
for (int i = 0; i < count; i++)
{
c = Math.Abs(v1[i] - v2[i]);
if (c > max)
{
max = c;
}
}
return max;
}
public static double ChebyshevDistance(int[] v1, int[] v2)
{
int count = v1.Length;
float max = float.MinValue;
float c;
for (int i = 0; i < count; i++)
{
c = Math.Abs(v1[i] - v2[i]);
if (c > max)
{
max = c;
}
}
return max;
}
public static double ChebyshevDistance(long[] v1, long[] v2)
{
int count = v1.Length;
float max = float.MinValue;
float c;
for (int i = 0; i < count; i++)
{
c = Math.Abs(v1[i] - v2[i]);
if (c > max)
{
max = c;
}
}
return max;
}
public static float Cosine(float[] u, float[] v)
{
if (u.Length != v.Length)
{
throw new ArgumentException("Vectors have non-matching dimensions");
}
float dot = 0.0f;
float nru = 0.0f;
float nrv = 0.0f;
for (int i = 0; i < u.Length; ++i)
{
dot += u[i] * v[i];
nru += u[i] * u[i];
nrv += v[i] * v[i];
}
var similarity = dot / (float)(Math.Sqrt(nru) * Math.Sqrt(nrv));
return 1 - similarity;
}
public static float Cosine(byte[] u, byte[] v)
{
if (u.Length != v.Length)
{
throw new ArgumentException("Vectors have non-matching dimensions");
}
float dot = 0.0f;
float nru = 0.0f;
float nrv = 0.0f;
for (int i = 0; i < u.Length; ++i)
{
dot += (float)(u[i] * v[i]);
nru += (float)(u[i] * u[i]);
nrv += (float)(v[i] * v[i]);
}
var similarity = dot / (float)(Math.Sqrt(nru) * Math.Sqrt(nrv));
return 1 - similarity;
}
public static float Cosine(int[] u, int[] v)
{
if (u.Length != v.Length)
{
throw new ArgumentException("Vectors have non-matching dimensions");
}
float dot = 0.0f;
float nru = 0.0f;
float nrv = 0.0f;
byte[] bu;
byte[] bv;
for (int i = 0; i < u.Length; ++i)
{
bu = BitConverter.GetBytes(u[i]);
bv = BitConverter.GetBytes(v[i]);
dot += (float)(bu[0] * bv[0]);
nru += (float)(bu[0] * bu[0]);
nrv += (float)(bv[0] * bv[0]);
dot += (float)(bu[1] * bv[1]);
nru += (float)(bu[1] * bu[1]);
nrv += (float)(bv[1] * bv[1]);
dot += (float)(bu[2] * bv[2]);
nru += (float)(bu[2] * bu[2]);
nrv += (float)(bv[2] * bv[2]);
dot += (float)(bu[3] * bv[3]);
nru += (float)(bu[3] * bu[3]);
nrv += (float)(bv[3] * bv[3]);
}
var similarity = dot / (float)(Math.Sqrt(nru) * Math.Sqrt(nrv));
return 1 - similarity;
}
public static float Cosine(long[] u, long[] v)
{
if (u.Length != v.Length)
{
throw new ArgumentException("Vectors have non-matching dimensions");
}
float dot = 0.0f;
float nru = 0.0f;
float nrv = 0.0f;
byte[] bu;
byte[] bv;
for (int i = 0; i < u.Length; ++i)
{
bu = BitConverter.GetBytes(u[i]);
bv = BitConverter.GetBytes(v[i]);
dot += (float)(bu[0] * bv[0]);
nru += (float)(bu[0] * bu[0]);
nrv += (float)(bv[0] * bv[0]);
dot += (float)(bu[1] * bv[1]);
nru += (float)(bu[1] * bu[1]);
nrv += (float)(bv[1] * bv[1]);
dot += (float)(bu[2] * bv[2]);
nru += (float)(bu[2] * bu[2]);
nrv += (float)(bv[2] * bv[2]);
dot += (float)(bu[3] * bv[3]);
nru += (float)(bu[3] * bu[3]);
nrv += (float)(bv[3] * bv[3]);
dot += (float)(bu[4] * bv[4]);
nru += (float)(bu[4] * bu[4]);
nrv += (float)(bv[4] * bv[4]);
dot += (float)(bu[5] * bv[5]);
nru += (float)(bu[5] * bu[5]);
nrv += (float)(bv[5] * bv[5]);
dot += (float)(bu[6] * bv[6]);
nru += (float)(bu[6] * bu[6]);
nrv += (float)(bv[6] * bv[6]);
dot += (float)(bu[7] * bv[7]);
nru += (float)(bu[7] * bu[7]);
nrv += (float)(bv[7] * bv[7]);
}
var similarity = dot / (float)(Math.Sqrt(nru) * Math.Sqrt(nrv));
return 1 - similarity;
}
}
}
Loading…
Cancel
Save

Powered by TurnKey Linux.