From 50e2e902510295a8e5baca1e25f704c351e4c6a1 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 18 Dec 2021 02:17:56 +0300 Subject: [PATCH] HNSW. Optimized version --- TestHNSW/HNSWDemo/Program.cs | 141 +------ ZeroLevel.HNSW/Services/Layer.cs | 163 ++++---- ZeroLevel.HNSW/Services/OPT/OptLayer.cs | 511 ------------------------ ZeroLevel.HNSW/Services/OPT/OptWorld.cs | 386 ------------------ ZeroLevel.HNSW/SmallWorld.cs | 113 ++++-- ZeroLevel/Services/Pools/ObjectPool.cs | 339 +++++++++++----- temp2/Program.cs | 6 +- 7 files changed, 405 insertions(+), 1254 deletions(-) delete mode 100644 ZeroLevel.HNSW/Services/OPT/OptLayer.cs delete mode 100644 ZeroLevel.HNSW/Services/OPT/OptWorld.cs diff --git a/TestHNSW/HNSWDemo/Program.cs b/TestHNSW/HNSWDemo/Program.cs index 86c033d..5078938 100644 --- a/TestHNSW/HNSWDemo/Program.cs +++ b/TestHNSW/HNSWDemo/Program.cs @@ -6,8 +6,6 @@ using System.IO; using System.Linq; using ZeroLevel.HNSW; using ZeroLevel.HNSW.Services; -using ZeroLevel.HNSW.Services.OPT; -using ZeroLevel.Services.Serialization; namespace HNSWDemo { @@ -167,7 +165,7 @@ namespace HNSWDemo static void Main(string[] args) { - OptAccuracityTest(); + AccuracityTest(); Console.WriteLine("Completed"); Console.ReadKey(); } @@ -611,27 +609,17 @@ namespace HNSWDemo var timewatchesNP = new List(); var timewatchesHNSW = new List(); - var totalOptHits = new List(); - var timewatchesOptHNSW = new List(); - var samples = RandomVectors(dimensionality, count); var sw = new Stopwatch(); - var test = new VectorsDirectCompare(samples, Metrics.L2Euclidean); - var world = new SmallWorld(NSWOptions.Create(8, 15, 200, 200, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); - - var opt_world = new OptWorld(NSWOptions.Create(8, 15, 200, 200, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); + var test = new VectorsDirectCompare(samples, CosineDistance.NonOptimized); + var world = new SmallWorld(NSWOptions.Create(6, 12, 100, 100, CosineDistance.NonOptimized, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); sw.Start(); var ids = world.AddItems(samples.ToArray()); sw.Stop(); Console.WriteLine($"Insert {ids.Length} items: {sw.ElapsedMilliseconds} ms"); - - sw.Restart(); - opt_world.AddItems(samples.ToArray()); - sw.Stop(); - Console.WriteLine($"Insert {ids.Length} items in OPT: {sw.ElapsedMilliseconds} ms"); Console.WriteLine("Start test"); @@ -657,142 +645,35 @@ namespace HNSWDemo } } totalHits.Add(hits); - - - sw.Restart(); - result = opt_world.Search(v, K); - sw.Stop(); - - timewatchesOptHNSW.Add(sw.ElapsedMilliseconds); - hits = 0; - foreach (var r in result) - { - if (gt.ContainsKey(r.Item1)) - { - hits++; - } - } - totalOptHits.Add(hits); } Console.WriteLine($"MIN Accuracity: {totalHits.Min() * 100 / K}%"); Console.WriteLine($"AVG Accuracity: {totalHits.Average() * 100 / K}%"); Console.WriteLine($"MAX Accuracity: {totalHits.Max() * 100 / K}%"); - Console.WriteLine($"MIN Opt Accuracity: {totalOptHits.Min() * 100 / K}%"); - Console.WriteLine($"AVG Opt Accuracity: {totalOptHits.Average() * 100 / K}%"); - Console.WriteLine($"MAX Opt Accuracity: {totalOptHits.Max() * 100 / K}%"); - Console.WriteLine($"MIN HNSW TIME: {timewatchesHNSW.Min()} ms"); Console.WriteLine($"AVG HNSW TIME: {timewatchesHNSW.Average()} ms"); Console.WriteLine($"MAX HNSW TIME: {timewatchesHNSW.Max()} ms"); - Console.WriteLine($"MIN Opt HNSW TIME: {timewatchesOptHNSW.Min()} ms"); - Console.WriteLine($"AVG Opt HNSW TIME: {timewatchesOptHNSW.Average()} ms"); - Console.WriteLine($"MAX Opt HNSW TIME: {timewatchesOptHNSW.Max()} ms"); - Console.WriteLine($"MIN NP TIME: {timewatchesNP.Min()} ms"); Console.WriteLine($"AVG NP TIME: {timewatchesNP.Average()} ms"); Console.WriteLine($"MAX NP TIME: {timewatchesNP.Max()} ms"); } - static void OptAccuracityTest() + static void InsertTimeExplosionTest() { - int K = 200; - var count = 5000; - var testCount = 1000; + var count = 1000; + var iterationCount = 1000; var dimensionality = 128; - var timewatchesNP = new List(); - var totalOptHits = new List(); - var timewatchesOptHNSW = new List(); - - var totalRestoredHits = new List(); - var timewatchesRestoredHNSW = new List(); - - var samples = RandomVectors(dimensionality, count); - var sw = new Stopwatch(); - - var test = new VectorsDirectCompare(samples, Metrics.L2Euclidean); - - var opt_world = new OptWorld(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); - - sw.Restart(); - var ids = opt_world.AddItems(samples.ToArray()); - sw.Stop(); - Console.WriteLine($"Insert {ids.Length} items in OPT: {sw.ElapsedMilliseconds} ms"); - - byte[] dump; - using (var ms = new MemoryStream()) + var world = new SmallWorld(NSWOptions.Create(6, 12, 100, 100, CosineDistance.NonOptimized, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple)); + for (int i = 0; i < iterationCount; i++) { - opt_world.Serialize(ms); - dump = ms.ToArray(); - } - - SmallWorld compactWorld; - using (var ms = new MemoryStream(dump)) - { - compactWorld = SmallWorld.CreateWorldFrom(NSWOptions.Create(8, 16, 200, 200, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple), ms); - } - - - Console.WriteLine("Start test"); - - - var test_vectors = RandomVectors(dimensionality, testCount); - foreach (var v in test_vectors) - { - sw.Restart(); - var gt = test.KNearest(v, K).ToDictionary(p => p.Item1, p => p.Item2); - sw.Stop(); - timewatchesNP.Add(sw.ElapsedMilliseconds); - - sw.Restart(); - var result = opt_world.Search(v, K).ToArray(); - sw.Stop(); - timewatchesOptHNSW.Add(sw.ElapsedMilliseconds); - var hits = 0; - foreach (var r in result) - { - if (gt.ContainsKey(r.Item1)) - { - hits++; - } - } - totalOptHits.Add(hits); - + var samples = RandomVectors(dimensionality, count); sw.Restart(); - result = compactWorld.Search(v, K).ToArray(); + var ids = world.AddItems(samples.ToArray()); sw.Stop(); - timewatchesRestoredHNSW.Add(sw.ElapsedMilliseconds); - hits = 0; - foreach (var r in result) - { - if (gt.ContainsKey(r.Item1)) - { - hits++; - } - } - totalRestoredHits.Add(hits); + Console.WriteLine($"ITERATION: [{i.ToString("D4")}] COUNT: [{ids.Length}] ELAPSEF [{sw.ElapsedMilliseconds} ms]"); } - Console.WriteLine($"MIN Opt Accuracity: {totalOptHits.Min() * 100 / K}%"); - Console.WriteLine($"AVG Opt Accuracity: {totalOptHits.Average() * 100 / K}%"); - Console.WriteLine($"MAX Opt Accuracity: {totalOptHits.Max() * 100 / K}%"); - - Console.WriteLine($"MIN Test Accuracity: {totalRestoredHits.Min() * 100 / K}%"); - Console.WriteLine($"AVG Test Accuracity: {totalRestoredHits.Average() * 100 / K}%"); - Console.WriteLine($"MAX Test Accuracity: {totalRestoredHits.Max() * 100 / K}%"); - - Console.WriteLine($"MIN Opt HNSW TIME: {timewatchesOptHNSW.Min()} ms"); - Console.WriteLine($"AVG Opt HNSW TIME: {timewatchesOptHNSW.Average()} ms"); - Console.WriteLine($"MAX Opt HNSW TIME: {timewatchesOptHNSW.Max()} ms"); - - Console.WriteLine($"MIN Test HNSW TIME: {timewatchesRestoredHNSW.Min()} ms"); - Console.WriteLine($"AVG Test HNSW TIME: {timewatchesRestoredHNSW.Average()} ms"); - Console.WriteLine($"MAX Test HNSW TIME: {timewatchesRestoredHNSW.Max()} ms"); - - Console.WriteLine($"MIN NP TIME: {timewatchesNP.Min()} ms"); - Console.WriteLine($"AVG NP TIME: {timewatchesNP.Average()} ms"); - Console.WriteLine($"MAX NP TIME: {timewatchesNP.Max()} ms"); } } } diff --git a/ZeroLevel.HNSW/Services/Layer.cs b/ZeroLevel.HNSW/Services/Layer.cs index a840cea..3071721 100644 --- a/ZeroLevel.HNSW/Services/Layer.cs +++ b/ZeroLevel.HNSW/Services/Layer.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; +using ZeroLevel.HNSW.Services; using ZeroLevel.Services.Serialization; namespace ZeroLevel.HNSW @@ -72,8 +73,18 @@ namespace ZeroLevel.HNSW } else { - // добавляем связь нового узла к найденному - _links.Add(q, p, qpDistance); + if (nearest.Length == 1 && nearest[0].Item1 == nearest[0].Item2) + { + // убираем связи на самих себя + var id1 = nearest[0].Item1; + var id2 = nearest[0].Item2; + _links.Relink(id1, id2, q, qpDistance, _options.Distance(_vectors[id2], _vectors[q])); + } + else + { + // добавляем связь нового узла к найденному + _links.Add(q, p, qpDistance); + } } } @@ -87,7 +98,7 @@ namespace ZeroLevel.HNSW } #region Implementation of https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf - internal int FingEntryPointAtLayer(Func targetCosts) + internal int FindEntryPointAtLayer(Func targetCosts) { var set = new HashSet(_links.Items().Select(p => p.Item1)); int minId = -1; @@ -110,7 +121,7 @@ namespace ZeroLevel.HNSW /// query element /// enter points ep /// Output: ef closest neighbors to q - internal void KNearestAtLayer(int entryPointId, Func targetCosts, IDictionary W, int ef) + internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func targetCosts, IEnumerable<(int, float)> w, int ef) { /* * v ← ep // set of visited elements @@ -135,22 +146,25 @@ namespace ZeroLevel.HNSW var v = new VisitedBitSet(_vectors.Count, _options.M); // v ← ep // set of visited elements v.Add(entryPointId); + var W = new MaxHeap(ef + 1); + foreach (var i in w) W.Push(i); + + var d = targetCosts(entryPointId); // C ← ep // set of candidates - var C = new Dictionary(); - C.Add(entryPointId, targetCosts(entryPointId)); + var C = new MinHeap(ef); + C.Push((entryPointId, d)); // W ← ep // dynamic list of found nearest neighbors - W.Add(entryPointId, C[entryPointId]); + W.Push((entryPointId, d)); + + int farthestId; + float farthestDistance; - var popCandidate = new Func<(int, float)>(() => { var pair = C.OrderBy(e => e.Value).First(); C.Remove(pair.Key); return (pair.Key, pair.Value); }); - var fartherFromResult = new Func<(int, float)>(() => { var pair = W.OrderByDescending(e => e.Value).First(); return (pair.Key, pair.Value); }); - var fartherPopFromResult = new Action(() => { var pair = W.OrderByDescending(e => e.Value).First(); W.Remove(pair.Key); }); // run bfs while (C.Count > 0) { // get next candidate to check and expand - var toExpand = popCandidate(); - var farthestResult = fartherFromResult(); - if (toExpand.Item2 > farthestResult.Item2) + var toExpand = C.Pop(); + if (W.TryPeek(out _, out farthestDistance) && toExpand.Item2 > farthestDistance) { // the closest candidate is farther than farthest result break; @@ -164,16 +178,17 @@ namespace ZeroLevel.HNSW if (!v.Contains(neighbourId)) { // enqueue perspective neighbours to expansion list - farthestResult = fartherFromResult(); + W.TryPeek(out farthestId, out farthestDistance); var neighbourDistance = targetCosts(neighbourId); - if (W.Count < ef || neighbourDistance < farthestResult.Item2) + if (W.Count < ef || (farthestId >= 0 && neighbourDistance < farthestDistance)) { - C.Add(neighbourId, neighbourDistance); - W.Add(neighbourId, neighbourDistance); + C.Push((neighbourId, neighbourDistance)); + + W.Push((neighbourId, neighbourDistance)); if (W.Count > ef) { - fartherPopFromResult(); + W.Pop(); } } v.Add(neighbourId); @@ -182,6 +197,7 @@ namespace ZeroLevel.HNSW } C.Clear(); v.Clear(); + return W; } /// @@ -190,7 +206,7 @@ namespace ZeroLevel.HNSW /// query element /// enter points ep /// Output: ef closest neighbors to q - internal void KNearestAtLayer(int entryPointId, Func targetCosts, IDictionary W, int ef, SearchContext context) + internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func targetCosts, IEnumerable<(int, float)> w, int ef, SearchContext context) { /* * v ← ep // set of visited elements @@ -215,25 +231,28 @@ namespace ZeroLevel.HNSW var v = new VisitedBitSet(_vectors.Count, _options.M); // v ← ep // set of visited elements v.Add(entryPointId); + + var W = new MaxHeap(ef + 1); + foreach (var i in w) W.Push(i); + // C ← ep // set of candidates - var C = new Dictionary(); - C.Add(entryPointId, targetCosts(entryPointId)); + var C = new MinHeap(ef); + var d = targetCosts(entryPointId); + C.Push((entryPointId, d)); // W ← ep // dynamic list of found nearest neighbors if (context.IsActiveNode(entryPointId)) { - W.Add(entryPointId, C[entryPointId]); + W.Push((entryPointId, d)); } - var popCandidate = new Func<(int, float)>(() => { var pair = C.OrderBy(e => e.Value).First(); C.Remove(pair.Key); return (pair.Key, pair.Value); }); - var farthestDistance = new Func(() => { var pair = W.OrderByDescending(e => e.Value).First(); return pair.Value; }); - var fartherPopFromResult = new Action(() => { var pair = W.OrderByDescending(e => e.Value).First(); W.Remove(pair.Key); }); // run bfs while (C.Count > 0) { // get next candidate to check and expand - var toExpand = popCandidate(); + var toExpand = C.Pop(); if (W.Count > 0) { - if (toExpand.Item2 > farthestDistance()) + if(W.TryPeek(out _, out var dist )) + if (toExpand.Item2 > dist) { // the closest candidate is farther than farthest result break; @@ -251,18 +270,18 @@ namespace ZeroLevel.HNSW var neighbourDistance = targetCosts(neighbourId); if (context.IsActiveNode(neighbourId)) { - if (W.Count < ef || (W.Count > 0 && neighbourDistance < farthestDistance())) + if (W.Count < ef || (W.Count > 0 && (W.TryPeek(out _, out var dist) && neighbourDistance < dist))) { - W.Add(neighbourId, neighbourDistance); + W.Push((neighbourId, neighbourDistance)); if (W.Count > ef) { - fartherPopFromResult(); + W.Pop(); } } } if (W.Count < ef) { - C.Add(neighbourId, neighbourDistance); + C.Push((neighbourId, neighbourDistance)); } v.Add(neighbourId); } @@ -270,6 +289,7 @@ namespace ZeroLevel.HNSW } C.Clear(); v.Clear(); + return W; } /// @@ -278,7 +298,7 @@ namespace ZeroLevel.HNSW /// query element /// enter points ep /// Output: ef closest neighbors to q - internal void KNearestAtLayer(IDictionary W, int ef, SearchContext context) + internal IEnumerable<(int, float)> KNearestAtLayer(IEnumerable<(int, float)> w, int ef, SearchContext context) { /* * v ← ep // set of visited elements @@ -303,29 +323,28 @@ namespace ZeroLevel.HNSW // v ← ep // set of visited elements var v = new VisitedBitSet(_vectors.Count, _options.M); // C ← ep // set of candidates - var C = new Dictionary(); + var C = new MinHeap(ef); foreach (var ep in context.EntryPoints) { var neighboursIds = GetNeighbors(ep).ToArray(); for (int i = 0; i < neighboursIds.Length; ++i) { - C.Add(ep, _links.Distance(ep, neighboursIds[i])); + C.Push((ep, _links.Distance(ep, neighboursIds[i]))); } v.Add(ep); } // W ← ep // dynamic list of found nearest neighbors + var W = new MaxHeap(ef + 1); + foreach (var i in w) W.Push(i); - var popCandidate = new Func<(int, float)>(() => { var pair = C.OrderBy(e => e.Value).First(); C.Remove(pair.Key); return (pair.Key, pair.Value); }); - var farthestDistance = new Func(() => { var pair = W.OrderByDescending(e => e.Value).First(); return pair.Value; }); - var fartherPopFromResult = new Action(() => { var pair = W.OrderByDescending(e => e.Value).First(); W.Remove(pair.Key); }); // run bfs while (C.Count > 0) { // get next candidate to check and expand - var toExpand = popCandidate(); + var toExpand = C.Pop(); if (W.Count > 0) { - if (toExpand.Item2 > farthestDistance()) + if (W.TryPeek(out _, out var dist) && toExpand.Item2 > dist) { // the closest candidate is farther than farthest result break; @@ -333,12 +352,12 @@ namespace ZeroLevel.HNSW } if (context.IsActiveNode(toExpand.Item1)) { - if (W.Count < ef || W.Count == 0 || (W.Count > 0 && toExpand.Item2 < farthestDistance())) + if (W.Count < ef || W.Count == 0 || (W.Count > 0 && (W.TryPeek(out _, out var dist) && toExpand.Item2 < dist))) { - W.Add(toExpand.Item1, toExpand.Item2); + W.Push((toExpand.Item1, toExpand.Item2)); if (W.Count > ef) { - fartherPopFromResult(); + W.Pop(); } } } @@ -347,21 +366,21 @@ namespace ZeroLevel.HNSW { while (W.Count > ef) { - fartherPopFromResult(); + W.Pop(); } - return; + return W; } else { foreach (var c in W) { - C.Add(c.Key, c.Value); + C.Push((c.Item1, c.Item2)); } } while (C.Count > 0) { // get next candidate to check and expand - var toExpand = popCandidate(); + var toExpand = C.Pop(); // expand candidate var neighboursIds = GetNeighbors(toExpand.Item1).ToArray(); for (int i = 0; i < neighboursIds.Length; ++i) @@ -373,18 +392,18 @@ namespace ZeroLevel.HNSW var neighbourDistance = _links.Distance(toExpand.Item1, neighbourId); if (context.IsActiveNode(neighbourId)) { - if (W.Count < ef || (W.Count > 0 && neighbourDistance < farthestDistance())) + if (W.Count < ef || (W.Count > 0 && (W.TryPeek(out _, out var dist) && neighbourDistance < dist))) { - W.Add(neighbourId, neighbourDistance); + W.Push((neighbourId, neighbourDistance)); if (W.Count > ef) { - fartherPopFromResult(); + W.Pop(); } } } if (W.Count < ef) { - C.Add(neighbourId, neighbourDistance); + C.Push((neighbourId, neighbourDistance)); } v.Add(neighbourId); } @@ -392,24 +411,24 @@ namespace ZeroLevel.HNSW } C.Clear(); v.Clear(); + return W; } /// /// Algorithm 3 /// - internal IDictionary SELECT_NEIGHBORS_SIMPLE(Func distance, IDictionary candidates, int M) + internal MaxHeap SELECT_NEIGHBORS_SIMPLE(IEnumerable<(int, float)> w, int M) { + var W = new MaxHeap(w.Count()); + foreach (var i in w) W.Push(i); var bestN = M; - var W = new Dictionary(candidates); if (W.Count > bestN) { - var popFarther = new Action(() => { var pair = W.OrderByDescending(e => e.Value).First(); W.Remove(pair.Key); }); while (W.Count > bestN) { - popFarther(); + W.Pop(); } } - // return M nearest elements from C to q return W; } @@ -423,12 +442,13 @@ namespace ZeroLevel.HNSW /// flag indicating whether or not to extend candidate list /// flag indicating whether or not to add discarded elements /// Output: M elements selected by the heuristic - internal IDictionary SELECT_NEIGHBORS_HEURISTIC(Func distance, IDictionary candidates, int M) + internal MaxHeap SELECT_NEIGHBORS_HEURISTIC(Func distance, IEnumerable<(int, float)> w, int M) { // R ← ∅ - var R = new Dictionary(); + var R = new MaxHeap(_options.EFConstruction); // W ← C // working queue for the candidates - var W = new Dictionary(candidates); + var W = new MaxHeap(_options.EFConstruction + 1); + foreach (var i in w) W.Push(i); // if extendCandidates // extend candidates by their neighbors if (_options.ExpandBestSelection) { @@ -436,7 +456,7 @@ namespace ZeroLevel.HNSW // for each e ∈ C foreach (var e in W) { - var neighbors = GetNeighbors(e.Key); + var neighbors = GetNeighbors(e.Item1); // for each e_adj ∈ neighbourhood(e) at layer lc foreach (var e_adj in neighbors) { @@ -450,37 +470,30 @@ namespace ZeroLevel.HNSW // W ← W ⋃ eadj foreach (var id in extendBuffer) { - W[id] = distance(id); + W.Push((id, distance(id))); } } // Wd ← ∅ // queue for the discarded candidates - var Wd = new Dictionary(); - - - var popCandidate = new Func<(int, float)>(() => { var pair = W.OrderBy(e => e.Value).First(); W.Remove(pair.Key); return (pair.Key, pair.Value); }); - var fartherFromResult = new Func<(int, float)>(() => { if (R.Count == 0) return (-1, 0f); var pair = R.OrderByDescending(e => e.Value).First(); return (pair.Key, pair.Value); }); - var popNearestDiscarded = new Func<(int, float)>(() => { var pair = Wd.OrderBy(e => e.Value).First(); Wd.Remove(pair.Key); return (pair.Key, pair.Value); }); - - + var Wd = new MinHeap(_options.EFConstruction); // while │W│ > 0 and │R│< M while (W.Count > 0 && R.Count < M) { // e ← extract nearest element from W to q - var (e, ed) = popCandidate(); - var (fe, fd) = fartherFromResult(); + var (e, ed) = W.Pop(); + var (fe, fd) = R.Pop(); // if e is closer to q compared to any element from R if (R.Count == 0 || ed < fd) { // R ← R ⋃ e - R.Add(e, ed); + R.Push((e, ed)); } else { // Wd ← Wd ⋃ e - Wd.Add(e, ed); + Wd.Push((e, ed)); } } // if keepPrunedConnections // add some of the discarded // connections from Wd @@ -490,8 +503,8 @@ namespace ZeroLevel.HNSW while (Wd.Count > 0 && R.Count < M) { // R ← R ⋃ extract nearest element from Wd to q - var nearest = popNearestDiscarded(); - R[nearest.Item1] = nearest.Item2; + var nearest = Wd.Pop(); + R.Push((nearest.Item1, nearest.Item2)); } } // return R @@ -513,4 +526,4 @@ namespace ZeroLevel.HNSW internal Histogram GetHistogram(HistogramMode mode) => _links.CalculateHistogram(mode); } -} \ No newline at end of file +} diff --git a/ZeroLevel.HNSW/Services/OPT/OptLayer.cs b/ZeroLevel.HNSW/Services/OPT/OptLayer.cs deleted file mode 100644 index e47dad4..0000000 --- a/ZeroLevel.HNSW/Services/OPT/OptLayer.cs +++ /dev/null @@ -1,511 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using ZeroLevel.Services.Serialization; - -namespace ZeroLevel.HNSW.Services.OPT -{ - /// - /// NSW graph - /// - internal sealed class OptLayer - : IBinarySerializable - { - private readonly NSWOptions _options; - private readonly VectorSet _vectors; - private readonly CompactBiDirectionalLinksSet _links; - internal SortedList Links => _links.Links; - - /// - /// There are links е the layer - /// - internal bool HasLinks => (_links.Count > 0); - - /// - /// HNSW layer - /// - /// HNSW graph options - /// General vector set - internal OptLayer(NSWOptions options, VectorSet vectors) - { - _options = options; - _vectors = vectors; - _links = new CompactBiDirectionalLinksSet(); - } - - /// - /// Adding new bidirectional link - /// - /// New node - /// The node with which the connection will be made - /// - /// - internal void AddBidirectionallConnections(int q, int p, float qpDistance, bool isMapLayer) - { - // поиск в ширину ближайших узлов к найденному - var nearest = _links.FindLinksForId(p).ToArray(); - // если у найденного узла максимальное количество связей - // if │eConn│ > Mmax // shrink connections of e - if (nearest.Length >= (isMapLayer ? _options.M * 2 : _options.M)) - { - // ищем связь с самой большой дистанцией - float distance = nearest[0].Item3; - int index = 0; - for (int ni = 1; ni < nearest.Length; ni++) - { - // Если осталась ссылка узла на себя, удаляем ее в первую очередь - if (nearest[ni].Item1 == nearest[ni].Item2) - { - index = ni; - break; - } - if (nearest[ni].Item3 > distance) - { - index = ni; - distance = nearest[ni].Item3; - } - } - // делаем перелинковку вставляя новый узел между найденными - var id1 = nearest[index].Item1; - var id2 = nearest[index].Item2; - _links.Relink(id1, id2, q, qpDistance, _options.Distance(_vectors[id2], _vectors[q])); - } - else - { - if (nearest.Length == 1 && nearest[0].Item1 == nearest[0].Item2) - { - // убираем связи на самих себя - var id1 = nearest[0].Item1; - var id2 = nearest[0].Item2; - _links.Relink(id1, id2, q, qpDistance, _options.Distance(_vectors[id2], _vectors[q])); - } - else - { - // добавляем связь нового узла к найденному - _links.Add(q, p, qpDistance); - } - } - } - - /// - /// Adding a node with a connection to itself - /// - /// - internal void Append(int q) - { - _links.Add(q, q, 0); - } - - #region Implementation of https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf - /// - /// Algorithm 2 - /// - /// query element - /// enter points ep - /// Output: ef closest neighbors to q - internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func targetCosts, IEnumerable<(int, float)> w, int ef) - { - /* - * v ← ep // set of visited elements - * C ← ep // set of candidates - * W ← ep // dynamic list of found nearest neighbors - * while │C│ > 0 - * c ← extract nearest element from C to q - * f ← get furthest element from W to q - * if distance(c, q) > distance(f, q) - * break // all elements in W are evaluated - * for each e ∈ neighbourhood(c) at layer lc // update C and W - * if e ∉ v - * v ← v ⋃ e - * f ← get furthest element from W to q - * if distance(e, q) < distance(f, q) or │W│ < ef - * C ← C ⋃ e - * W ← W ⋃ e - * if │W│ > ef - * remove furthest element from W to q - * return W - */ - var v = new VisitedBitSet(_vectors.Count, _options.M); - // v ← ep // set of visited elements - v.Add(entryPointId); - var W = new MaxHeap(ef + 1); - foreach (var i in w) W.Push(i); - - var d = targetCosts(entryPointId); - // C ← ep // set of candidates - var C = new MinHeap(ef); - C.Push((entryPointId, d)); - // W ← ep // dynamic list of found nearest neighbors - W.Push((entryPointId, d)); - - int farthestId; - float farthestDistance; - - // run bfs - while (C.Count > 0) - { - // get next candidate to check and expand - var toExpand = C.Pop(); - if (W.TryPeek(out _, out farthestDistance) && toExpand.Item2 > farthestDistance) - { - // the closest candidate is farther than farthest result - break; - } - - // expand candidate - var neighboursIds = GetNeighbors(toExpand.Item1).ToArray(); - for (int i = 0; i < neighboursIds.Length; ++i) - { - int neighbourId = neighboursIds[i]; - if (!v.Contains(neighbourId)) - { - // enqueue perspective neighbours to expansion list - W.TryPeek(out farthestId, out farthestDistance); - - var neighbourDistance = targetCosts(neighbourId); - if (W.Count < ef || (farthestId >= 0 && neighbourDistance < farthestDistance)) - { - C.Push((neighbourId, neighbourDistance)); - - W.Push((neighbourId, neighbourDistance)); - if (W.Count > ef) - { - W.Pop(); - } - } - v.Add(neighbourId); - } - } - } - C.Clear(); - v.Clear(); - return W; - } - - /// - /// Algorithm 2 - /// - /// query element - /// enter points ep - /// Output: ef closest neighbors to q - internal IEnumerable<(int, float)> KNearestAtLayer(int entryPointId, Func targetCosts, IEnumerable<(int, float)> w, int ef, SearchContext context) - { - /* - * v ← ep // set of visited elements - * C ← ep // set of candidates - * W ← ep // dynamic list of found nearest neighbors - * while │C│ > 0 - * c ← extract nearest element from C to q - * f ← get furthest element from W to q - * if distance(c, q) > distance(f, q) - * break // all elements in W are evaluated - * for each e ∈ neighbourhood(c) at layer lc // update C and W - * if e ∉ v - * v ← v ⋃ e - * f ← get furthest element from W to q - * if distance(e, q) < distance(f, q) or │W│ < ef - * C ← C ⋃ e - * W ← W ⋃ e - * if │W│ > ef - * remove furthest element from W to q - * return W - */ - var v = new VisitedBitSet(_vectors.Count, _options.M); - // v ← ep // set of visited elements - v.Add(entryPointId); - - var W = new MaxHeap(ef + 1); - foreach (var i in w) W.Push(i); - - // C ← ep // set of candidates - var C = new MinHeap(ef); - var d = targetCosts(entryPointId); - C.Push((entryPointId, d)); - // W ← ep // dynamic list of found nearest neighbors - if (context.IsActiveNode(entryPointId)) - { - W.Push((entryPointId, d)); - } - // run bfs - while (C.Count > 0) - { - // get next candidate to check and expand - var toExpand = C.Pop(); - if (W.Count > 0) - { - if(W.TryPeek(out _, out var dist )) - if (toExpand.Item2 > dist) - { - // the closest candidate is farther than farthest result - break; - } - } - - // expand candidate - var neighboursIds = GetNeighbors(toExpand.Item1).ToArray(); - for (int i = 0; i < neighboursIds.Length; ++i) - { - int neighbourId = neighboursIds[i]; - if (!v.Contains(neighbourId)) - { - // enqueue perspective neighbours to expansion list - var neighbourDistance = targetCosts(neighbourId); - if (context.IsActiveNode(neighbourId)) - { - if (W.Count < ef || (W.Count > 0 && (W.TryPeek(out _, out var dist) && neighbourDistance < dist))) - { - W.Push((neighbourId, neighbourDistance)); - if (W.Count > ef) - { - W.Pop(); - } - } - } - if (W.Count < ef) - { - C.Push((neighbourId, neighbourDistance)); - } - v.Add(neighbourId); - } - } - } - C.Clear(); - v.Clear(); - return W; - } - - /// - /// Algorithm 2, modified for LookAlike - /// - /// query element - /// enter points ep - /// Output: ef closest neighbors to q - internal IEnumerable<(int, float)> KNearestAtLayer(IEnumerable<(int, float)> w, int ef, SearchContext context) - { - /* - * v ← ep // set of visited elements - * C ← ep // set of candidates - * W ← ep // dynamic list of found nearest neighbors - * while │C│ > 0 - * c ← extract nearest element from C to q - * f ← get furthest element from W to q - * if distance(c, q) > distance(f, q) - * break // all elements in W are evaluated - * for each e ∈ neighbourhood(c) at layer lc // update C and W - * if e ∉ v - * v ← v ⋃ e - * f ← get furthest element from W to q - * if distance(e, q) < distance(f, q) or │W│ < ef - * C ← C ⋃ e - * W ← W ⋃ e - * if │W│ > ef - * remove furthest element from W to q - * return W - */ - // v ← ep // set of visited elements - var v = new VisitedBitSet(_vectors.Count, _options.M); - // C ← ep // set of candidates - var C = new MinHeap(ef); - foreach (var ep in context.EntryPoints) - { - var neighboursIds = GetNeighbors(ep).ToArray(); - for (int i = 0; i < neighboursIds.Length; ++i) - { - C.Push((ep, _links.Distance(ep, neighboursIds[i]))); - } - v.Add(ep); - } - // W ← ep // dynamic list of found nearest neighbors - var W = new MaxHeap(ef + 1); - foreach (var i in w) W.Push(i); - - // run bfs - while (C.Count > 0) - { - // get next candidate to check and expand - var toExpand = C.Pop(); - if (W.Count > 0) - { - if (W.TryPeek(out _, out var dist) && toExpand.Item2 > dist) - { - // the closest candidate is farther than farthest result - break; - } - } - if (context.IsActiveNode(toExpand.Item1)) - { - if (W.Count < ef || W.Count == 0 || (W.Count > 0 && (W.TryPeek(out _, out var dist) && toExpand.Item2 < dist))) - { - W.Push((toExpand.Item1, toExpand.Item2)); - if (W.Count > ef) - { - W.Pop(); - } - } - } - } - if (W.Count > ef) - { - while (W.Count > ef) - { - W.Pop(); - } - return W; - } - else - { - foreach (var c in W) - { - C.Push((c.Item1, c.Item2)); - } - } - while (C.Count > 0) - { - // get next candidate to check and expand - var toExpand = C.Pop(); - // expand candidate - var neighboursIds = GetNeighbors(toExpand.Item1).ToArray(); - for (int i = 0; i < neighboursIds.Length; ++i) - { - int neighbourId = neighboursIds[i]; - if (!v.Contains(neighbourId)) - { - // enqueue perspective neighbours to expansion list - var neighbourDistance = _links.Distance(toExpand.Item1, neighbourId); - if (context.IsActiveNode(neighbourId)) - { - if (W.Count < ef || (W.Count > 0 && (W.TryPeek(out _, out var dist) && neighbourDistance < dist))) - { - W.Push((neighbourId, neighbourDistance)); - if (W.Count > ef) - { - W.Pop(); - } - } - } - if (W.Count < ef) - { - C.Push((neighbourId, neighbourDistance)); - } - v.Add(neighbourId); - } - } - } - C.Clear(); - v.Clear(); - return W; - } - - /// - /// Algorithm 3 - /// - internal MaxHeap SELECT_NEIGHBORS_SIMPLE(IEnumerable<(int, float)> w, int M) - { - var W = new MaxHeap(w.Count()); - foreach (var i in w) W.Push(i); - var bestN = M; - if (W.Count > bestN) - { - while (W.Count > bestN) - { - W.Pop(); - } - } - return W; - } - - - - /// - /// Algorithm 4 - /// - /// base element - /// candidate elements - /// flag indicating whether or not to extend candidate list - /// flag indicating whether or not to add discarded elements - /// Output: M elements selected by the heuristic - internal MaxHeap SELECT_NEIGHBORS_HEURISTIC(Func distance, IEnumerable<(int, float)> w, int M) - { - // R ← ∅ - var R = new MaxHeap(_options.EFConstruction); - // W ← C // working queue for the candidates - var W = new MaxHeap(_options.EFConstruction + 1); - foreach (var i in w) W.Push(i); - // if extendCandidates // extend candidates by their neighbors - if (_options.ExpandBestSelection) - { - var extendBuffer = new HashSet(); - // for each e ∈ C - foreach (var e in W) - { - var neighbors = GetNeighbors(e.Item1); - // for each e_adj ∈ neighbourhood(e) at layer lc - foreach (var e_adj in neighbors) - { - // if eadj ∉ W - if (extendBuffer.Contains(e_adj) == false) - { - extendBuffer.Add(e_adj); - } - } - } - // W ← W ⋃ eadj - foreach (var id in extendBuffer) - { - W.Push((id, distance(id))); - } - } - - // Wd ← ∅ // queue for the discarded candidates - var Wd = new MinHeap(_options.EFConstruction); - // while │W│ > 0 and │R│< M - while (W.Count > 0 && R.Count < M) - { - // e ← extract nearest element from W to q - var (e, ed) = W.Pop(); - var (fe, fd) = R.Pop(); - - // if e is closer to q compared to any element from R - if (R.Count == 0 || - ed < fd) - { - // R ← R ⋃ e - R.Push((e, ed)); - } - else - { - // Wd ← Wd ⋃ e - Wd.Push((e, ed)); - } - } - // if keepPrunedConnections // add some of the discarded // connections from Wd - if (_options.KeepPrunedConnections) - { - // while │Wd│> 0 and │R│< M - while (Wd.Count > 0 && R.Count < M) - { - // R ← R ⋃ extract nearest element from Wd to q - var nearest = Wd.Pop(); - R.Push((nearest.Item1, nearest.Item2)); - } - } - // return R - return R; - } - #endregion - - private IEnumerable GetNeighbors(int id) => _links.FindLinksForId(id).Select(d => d.Item2); - - public void Serialize(IBinaryWriter writer) - { - _links.Serialize(writer); - } - - public void Deserialize(IBinaryReader reader) - { - _links.Deserialize(reader); - } - - internal Histogram GetHistogram(HistogramMode mode) => _links.CalculateHistogram(mode); - } -} diff --git a/ZeroLevel.HNSW/Services/OPT/OptWorld.cs b/ZeroLevel.HNSW/Services/OPT/OptWorld.cs deleted file mode 100644 index c2ba6fe..0000000 --- a/ZeroLevel.HNSW/Services/OPT/OptWorld.cs +++ /dev/null @@ -1,386 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Threading; -using ZeroLevel.Services.Serialization; - -namespace ZeroLevel.HNSW.Services.OPT -{ - public class OptWorld - { - private readonly NSWOptions _options; - private VectorSet _vectors; - private OptLayer[] _layers; - private int EntryPoint = 0; - private int MaxLayer = 0; - private readonly ProbabilityLayerNumberGenerator _layerLevelGenerator; - private ReaderWriterLockSlim _lockGraph = new ReaderWriterLockSlim(); - internal SortedList GetNSWLinks() => _layers[0].Links; - - public OptWorld(NSWOptions options) - { - _options = options; - _vectors = new VectorSet(); - _layers = new OptLayer[_options.LayersCount]; - _layerLevelGenerator = new ProbabilityLayerNumberGenerator(_options.LayersCount, _options.M); - for (int i = 0; i < _options.LayersCount; i++) - { - _layers[i] = new OptLayer(_options, _vectors); - } - } - - public OptWorld(NSWOptions options, Stream stream) - { - _options = options; - Deserialize(stream); - } - - /// - /// Search in the graph K for vectors closest to a given vector - /// - /// Given vector - /// Count of elements for search - /// - /// - public IEnumerable<(int, TItem, float)> Search(TItem vector, int k) - { - foreach (var pair in KNearest(vector, k)) - { - yield return (pair.Item1, _vectors[pair.Item1], pair.Item2); - } - } - - public IEnumerable<(int, TItem, float)> Search(TItem vector, int k, SearchContext context) - { - if (context == null) - { - foreach (var pair in KNearest(vector, k)) - { - yield return (pair.Item1, _vectors[pair.Item1], pair.Item2); - } - } - else - { - foreach (var pair in KNearest(vector, k, context)) - { - yield return (pair.Item1, _vectors[pair.Item1], pair.Item2); - } - } - } - - public IEnumerable<(int, TItem, float)> Search(int k, SearchContext context) - { - if (context == null) - { - throw new ArgumentNullException(nameof(context)); - } - else - { - foreach (var pair in KNearest(k, context)) - { - yield return (pair.Item1, _vectors[pair.Item1], pair.Item2); - } - } - } - - /// - /// Adding vectors batch - /// - /// Vectors - /// Vector identifiers in a graph - public int[] AddItems(IEnumerable vectors) - { - _lockGraph.EnterWriteLock(); - try - { - var ids = _vectors.Append(vectors); - for (int i = 0; i < ids.Length; i++) - { - INSERT(ids[i]); - } - return ids; - } - finally - { - _lockGraph.ExitWriteLock(); - } - } - - #region https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf - /// - /// Algorithm 1 - /// - private void INSERT(int q) - { - var distance = new Func(candidate => _options.Distance(_vectors[q], _vectors[candidate])); - // W ← ∅ // list for the currently found nearest elements - var W = new MinHeap(); - // ep ← get enter point for hnsw - //var ep = _layers[MaxLayer].FingEntryPointAtLayer(distance); - //if(ep == -1) ep = EntryPoint; - var ep = EntryPoint; - var epDist = distance(ep); - // L ← level of ep // top layer for hnsw - var L = MaxLayer; - // l ← ⌊-ln(unif(0..1))∙mL⌋ // new element’s level - int l = _layerLevelGenerator.GetRandomLayer(); - // for lc ← L … l+1 - // Проход с верхнего уровня до уровня где появляется элемент, для нахождения точки входа - - int id; - float value; - for (int lc = L; lc > l; --lc) - { - // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - foreach (var i in _layers[lc].KNearestAtLayer(ep, distance, W, 1)) - { - W.Push(i); - } - // ep ← get the nearest element from W to q - if (W.TryPeek(out id, out value)) - { - ep = id; - epDist = value; - } - W.Clear(); - } - //for lc ← min(L, l) … 0 - // connecting new node to the small world - for (int lc = Math.Min(L, l); lc >= 0; --lc) - { - if (_layers[lc].HasLinks == false) - { - _layers[lc].Append(q); - } - else - { - // W ← SEARCH - LAYER(q, ep, efConstruction, lc) - foreach (var i in _layers[lc].KNearestAtLayer(ep, distance, W, _options.EFConstruction)) - { - W.Push(i); - } - - // ep ← W - if (W.TryPeek(out id, out value)) - { - ep = id; - epDist = value; - } - - // neighbors ← SELECT-NEIGHBORS(q, W, M, lc) // alg. 3 or alg. 4 - var neighbors = SelectBestForConnecting(lc, distance, W); - // add bidirectionall connectionts from neighbors to q at layer lc - // for each e ∈ neighbors // shrink connections if needed - foreach (var e in neighbors) - { - // eConn ← neighbourhood(e) at layer lc - _layers[lc].AddBidirectionallConnections(q, e.Item1, e.Item2, lc == 0); - // if distance from newNode to newNeighbour is better than to bestPeer => update bestPeer - if (e.Item2 < epDist) - { - ep = e.Item1; - epDist = e.Item2; - } - } - W.Clear(); - } - } - // if l > L - if (l > L) - { - // set enter point for hnsw to q - L = l; - MaxLayer = l; - EntryPoint = ep; - } - } - - /// - /// Get maximum allowed connections for the given level. - /// - /// - /// Article: Section 4.1: - /// "Selection of the Mmax0 (the maximum number of connections that an element can have in the zero layer) also - /// has a strong influence on the search performance, especially in case of high quality(high recall) search. - /// Simulations show that setting Mmax0 to M(this corresponds to kNN graphs on each layer if the neighbors - /// selection heuristic is not used) leads to a very strong performance penalty at high recall. - /// Simulations also suggest that 2∙M is a good choice for Mmax0; - /// setting the parameter higher leads to performance degradation and excessive memory usage." - /// - /// The level of the layer. - /// The maximum number of connections. - private int GetM(int layer) - { - return layer == 0 ? 2 * _options.M : _options.M; - } - - private IEnumerable<(int, float)> SelectBestForConnecting(int layer, Func distance, IEnumerable<(int, float)> candidates) - { - if (_options.SelectionHeuristic == NeighbourSelectionHeuristic.SelectSimple) - return _layers[layer].SELECT_NEIGHBORS_SIMPLE(candidates, GetM(layer)); - return _layers[layer].SELECT_NEIGHBORS_HEURISTIC(distance, candidates, GetM(layer)); - } - - /// - /// Algorithm 5 - /// - private IEnumerable<(int, float)> KNearest(TItem q, int k) - { - _lockGraph.EnterReadLock(); - try - { - if (_vectors.Count == 0) - { - return Enumerable.Empty<(int, float)>(); - } - - int id; - float value; - var distance = new Func(candidate => _options.Distance(q, _vectors[candidate])); - - // W ← ∅ // set for the current nearest elements - var W = new MinHeap(k + 1); - // ep ← get enter point for hnsw - var ep = EntryPoint; - // L ← level of ep // top layer for hnsw - var L = MaxLayer; - // for lc ← L … 1 - for (int layer = L; layer > 0; --layer) - { - // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - foreach (var i in _layers[layer].KNearestAtLayer(ep, distance, W, 1)) - { - W.Push(i); - } - // ep ← get nearest element from W to q - if (W.TryPeek(out id, out value)) - { - ep = id; - } - W.Clear(); - } - // W ← SEARCH-LAYER(q, ep, ef, lc =0) - foreach (var i in _layers[0].KNearestAtLayer(ep, distance, W, k)) - { - W.Push(i); - } - // return K nearest elements from W to q - return W; - } - finally - { - _lockGraph.ExitReadLock(); - } - } - private IEnumerable<(int, float)> KNearest(TItem q, int k, SearchContext context) - { - _lockGraph.EnterReadLock(); - try - { - if (_vectors.Count == 0) - { - return Enumerable.Empty<(int, float)>(); - } - int id; - float value; - var distance = new Func(candidate => _options.Distance(q, _vectors[candidate])); - - // W ← ∅ // set for the current nearest elements - var W = new MinHeap(k + 1); - // ep ← get enter point for hnsw - var ep = EntryPoint; - // L ← level of ep // top layer for hnsw - var L = MaxLayer; - // for lc ← L … 1 - for (int layer = L; layer > 0; --layer) - { - // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - foreach (var i in _layers[layer].KNearestAtLayer(ep, distance, W, 1)) - { - W.Push(i); - } - // ep ← get nearest element from W to q - if (W.TryPeek(out id, out value)) - { - ep = id; - } - W.Clear(); - } - // W ← SEARCH-LAYER(q, ep, ef, lc =0) - foreach (var i in _layers[0].KNearestAtLayer(ep, distance, W, k, context)) - { - W.Push(i); - } - // return K nearest elements from W to q - return W; - } - finally - { - _lockGraph.ExitReadLock(); - } - } - - private IEnumerable<(int, float)> KNearest(int k, SearchContext context) - { - _lockGraph.EnterReadLock(); - try - { - if (_vectors.Count == 0) - { - return Enumerable.Empty<(int, float)>(); - } - // W ← ∅ // set for the current nearest elements - var W = new MaxHeap(k + 1); - // W ← SEARCH-LAYER(q, ep, ef, lc =0) - foreach (var i in _layers[0].KNearestAtLayer(W, k, context)) - { - W.Push(i); - } - // return K nearest elements from W to q - return W; - } - finally - { - _lockGraph.ExitReadLock(); - } - } - #endregion - - public void Serialize(Stream stream) - { - using (var writer = new MemoryStreamWriter(stream)) - { - writer.WriteInt32(EntryPoint); - writer.WriteInt32(MaxLayer); - _vectors.Serialize(writer); - writer.WriteInt32(_layers.Length); - foreach (var l in _layers) - { - l.Serialize(writer); - } - } - } - - public void Deserialize(Stream stream) - { - using (var reader = new MemoryStreamReader(stream)) - { - this.EntryPoint = reader.ReadInt32(); - this.MaxLayer = reader.ReadInt32(); - _vectors = new VectorSet(); - _vectors.Deserialize(reader); - var countLayers = reader.ReadInt32(); - _layers = new OptLayer[countLayers]; - for (int i = 0; i < countLayers; i++) - { - _layers[i] = new OptLayer(_options, _vectors); - _layers[i].Deserialize(reader); - } - } - } - - public Histogram GetHistogram(HistogramMode mode = HistogramMode.SQRT) - => _layers[0].GetHistogram(mode); - } -} diff --git a/ZeroLevel.HNSW/SmallWorld.cs b/ZeroLevel.HNSW/SmallWorld.cs index bc65ff8..d70733a 100644 --- a/ZeroLevel.HNSW/SmallWorld.cs +++ b/ZeroLevel.HNSW/SmallWorld.cs @@ -31,7 +31,7 @@ namespace ZeroLevel.HNSW } } - internal SmallWorld(NSWOptions options, Stream stream) + public SmallWorld(NSWOptions options, Stream stream) { _options = options; Deserialize(stream); @@ -116,11 +116,10 @@ namespace ZeroLevel.HNSW { var distance = new Func(candidate => _options.Distance(_vectors[q], _vectors[candidate])); // W ← ∅ // list for the currently found nearest elements - IDictionary W = new Dictionary(); + var W = new MinHeap(_options.EFConstruction + 1); // ep ← get enter point for hnsw - //var ep = _layers[MaxLayer].FingEntryPointAtLayer(distance); - //if(ep == -1) ep = EntryPoint; - var ep = EntryPoint; + var ep = _layers[MaxLayer].FindEntryPointAtLayer(distance); + if(ep == -1) ep = EntryPoint; var epDist = distance(ep); // L ← level of ep // top layer for hnsw var L = MaxLayer; @@ -128,23 +127,23 @@ namespace ZeroLevel.HNSW int l = _layerLevelGenerator.GetRandomLayer(); // for lc ← L … l+1 // Проход с верхнего уровня до уровня где появляется элемент, для нахождения точки входа + + int id; + float value; for (int lc = L; lc > l; --lc) { - if (_layers[lc].HasLinks == false) + // W ← SEARCH-LAYER(q, ep, ef = 1, lc) + foreach (var i in _layers[lc].KNearestAtLayer(ep, distance, W, 1)) { - _layers[lc].Append(q); - ep = q; + W.Push(i); } - else + // ep ← get the nearest element from W to q + if (W.TryPeek(out id, out value)) { - // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - _layers[lc].KNearestAtLayer(ep, distance, W, 1); - // ep ← get the nearest element from W to q - var nearest = W.OrderBy(p => p.Value).First(); - ep = nearest.Key; - epDist = nearest.Value; - W.Clear(); + ep = id; + epDist = value; } + W.Clear(); } //for lc ← min(L, l) … 0 // connecting new node to the small world @@ -153,12 +152,22 @@ namespace ZeroLevel.HNSW if (_layers[lc].HasLinks == false) { _layers[lc].Append(q); - ep = q; } else { // W ← SEARCH - LAYER(q, ep, efConstruction, lc) - _layers[lc].KNearestAtLayer(ep, distance, W, _options.EFConstruction); + foreach (var i in _layers[lc].KNearestAtLayer(ep, distance, W, _options.EFConstruction)) + { + W.Push(i); + } + + // ep ← W + if (W.TryPeek(out id, out value)) + { + ep = id; + epDist = value; + } + // neighbors ← SELECT-NEIGHBORS(q, W, M, lc) // alg. 3 or alg. 4 var neighbors = SelectBestForConnecting(lc, distance, W); // add bidirectionall connectionts from neighbors to q at layer lc @@ -166,16 +175,14 @@ namespace ZeroLevel.HNSW foreach (var e in neighbors) { // eConn ← neighbourhood(e) at layer lc - _layers[lc].AddBidirectionallConnections(q, e.Key, e.Value, lc == 0); + _layers[lc].AddBidirectionallConnections(q, e.Item1, e.Item2, lc == 0); // if distance from newNode to newNeighbour is better than to bestPeer => update bestPeer - if (e.Value < epDist) + if (e.Item2 < epDist) { - ep = e.Key; - epDist = e.Value; + ep = e.Item1; + epDist = e.Item2; } } - // ep ← W - ep = W.OrderBy(p => p.Value).First().Key; W.Clear(); } } @@ -208,10 +215,10 @@ namespace ZeroLevel.HNSW return layer == 0 ? 2 * _options.M : _options.M; } - private IDictionary SelectBestForConnecting(int layer, Func distance, IDictionary candidates) + private IEnumerable<(int, float)> SelectBestForConnecting(int layer, Func distance, IEnumerable<(int, float)> candidates) { if (_options.SelectionHeuristic == NeighbourSelectionHeuristic.SelectSimple) - return _layers[layer].SELECT_NEIGHBORS_SIMPLE(distance, candidates, GetM(layer)); + return _layers[layer].SELECT_NEIGHBORS_SIMPLE(candidates, GetM(layer)); return _layers[layer].SELECT_NEIGHBORS_HEURISTIC(distance, candidates, GetM(layer)); } @@ -227,10 +234,13 @@ namespace ZeroLevel.HNSW { return Enumerable.Empty<(int, float)>(); } + + int id; + float value; var distance = new Func(candidate => _options.Distance(q, _vectors[candidate])); // W ← ∅ // set for the current nearest elements - var W = new Dictionary(k + 1); + var W = new MinHeap(k + 1); // ep ← get enter point for hnsw var ep = EntryPoint; // L ← level of ep // top layer for hnsw @@ -239,21 +249,31 @@ namespace ZeroLevel.HNSW for (int layer = L; layer > 0; --layer) { // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - _layers[layer].KNearestAtLayer(ep, distance, W, 1); + foreach (var i in _layers[layer].KNearestAtLayer(ep, distance, W, 1)) + { + W.Push(i); + } // ep ← get nearest element from W to q - ep = W.OrderBy(p => p.Value).First().Key; + if (W.TryPeek(out id, out value)) + { + ep = id; + } W.Clear(); } // W ← SEARCH-LAYER(q, ep, ef, lc =0) - _layers[0].KNearestAtLayer(ep, distance, W, k); + foreach (var i in _layers[0].KNearestAtLayer(ep, distance, W, k)) + { + W.Push(i); + } // return K nearest elements from W to q - return W.Select(p => (p.Key, p.Value)); + return W; } finally { _lockGraph.ExitReadLock(); } } + private IEnumerable<(int, float)> KNearest(TItem q, int k, SearchContext context) { _lockGraph.EnterReadLock(); @@ -263,10 +283,12 @@ namespace ZeroLevel.HNSW { return Enumerable.Empty<(int, float)>(); } + int id; + float value; var distance = new Func(candidate => _options.Distance(q, _vectors[candidate])); // W ← ∅ // set for the current nearest elements - var W = new Dictionary(k + 1); + var W = new MinHeap(k + 1); // ep ← get enter point for hnsw var ep = EntryPoint; // L ← level of ep // top layer for hnsw @@ -275,15 +297,24 @@ namespace ZeroLevel.HNSW for (int layer = L; layer > 0; --layer) { // W ← SEARCH-LAYER(q, ep, ef = 1, lc) - _layers[layer].KNearestAtLayer(ep, distance, W, 1); + foreach (var i in _layers[layer].KNearestAtLayer(ep, distance, W, 1)) + { + W.Push(i); + } // ep ← get nearest element from W to q - ep = W.OrderBy(p => p.Value).First().Key; + if (W.TryPeek(out id, out value)) + { + ep = id; + } W.Clear(); } // W ← SEARCH-LAYER(q, ep, ef, lc =0) - _layers[0].KNearestAtLayer(ep, distance, W, k, context); + foreach (var i in _layers[0].KNearestAtLayer(ep, distance, W, k, context)) + { + W.Push(i); + } // return K nearest elements from W to q - return W.Select(p => (p.Key, p.Value)); + return W; } finally { @@ -300,13 +331,15 @@ namespace ZeroLevel.HNSW { return Enumerable.Empty<(int, float)>(); } - var distance = new Func((id1, id2) => _options.Distance(_vectors[id1], _vectors[id2])); // W ← ∅ // set for the current nearest elements - var W = new Dictionary(k + 1); + var W = new MinHeap(k + 1); // W ← SEARCH-LAYER(q, ep, ef, lc =0) - _layers[0].KNearestAtLayer(W, k, context); + foreach (var i in _layers[0].KNearestAtLayer(W, k, context)) + { + W.Push(i); + } // return K nearest elements from W to q - return W.Select(p => (p.Key, p.Value)); + return W; } finally { diff --git a/ZeroLevel/Services/Pools/ObjectPool.cs b/ZeroLevel/Services/Pools/ObjectPool.cs index e4f2691..e26f698 100644 --- a/ZeroLevel/Services/Pools/ObjectPool.cs +++ b/ZeroLevel/Services/Pools/ObjectPool.cs @@ -1,166 +1,287 @@ using System; +using System.Collections.Generic; using System.Diagnostics; using System.Threading; namespace ZeroLevel.Services.Pools { - /// - /// Steal from Roslyn - /// https://github.com/dotnet/roslyn/blob/master/src/Dependencies/PooledObjects/ObjectPool%601.cs - /// - public class ObjectPool where T : class + public enum LoadingMode { Eager, Lazy, LazyExpanding }; + + public enum AccessMode { FIFO, LIFO, Circular }; + + public sealed class Pool : IDisposable { - [DebuggerDisplay("{Value,nq}")] - private struct Element + private bool isDisposed; + private Func, T> factory; + private LoadingMode loadingMode; + private IItemStore itemStore; + private int size; + private int count; + private Semaphore sync; + + public Pool(int size, Func, T> factory) + : this(size, factory, LoadingMode.Lazy, AccessMode.FIFO) { - internal T Value; } - /// - /// Not using System.Func{T} because this file is linked into the (debugger) Formatter, - /// which does not have that type (since it compiles against .NET 2.0). - /// - public delegate T Factory(); + public Pool(int size, Func, T> factory, + LoadingMode loadingMode, AccessMode accessMode) + { + if (size <= 0) + throw new ArgumentOutOfRangeException("size", size, + "Argument 'size' must be greater than zero."); + if (factory == null) + throw new ArgumentNullException("factory"); - // Storage for the pool objects. The first item is stored in a dedicated field because we - // expect to be able to satisfy most requests from it. - private T _firstItem; + this.size = size; + this.factory = factory; + sync = new Semaphore(size, size); + this.loadingMode = loadingMode; + this.itemStore = CreateItemStore(accessMode, size); + if (loadingMode == LoadingMode.Eager) + { + PreloadItems(); + } + } - private readonly Element[] _items; + public T Acquire() + { + sync.WaitOne(); + switch (loadingMode) + { + case LoadingMode.Eager: + return AcquireEager(); + case LoadingMode.Lazy: + return AcquireLazy(); + default: + Debug.Assert(loadingMode == LoadingMode.LazyExpanding, + "Unknown LoadingMode encountered in Acquire method."); + return AcquireLazyExpanding(); + } + } - // factory is stored for the lifetime of the pool. We will call this only when pool needs to - // expand. compared to "new T()", Func gives more flexibility to implementers and faster - // than "new T()". - private readonly Factory _factory; + public void Release(T item) + { + lock (itemStore) + { + itemStore.Store(item); + } + sync.Release(); + } - public int Count => _items?.Length ?? 0; + public void Dispose() + { + if (isDisposed) + { + return; + } + isDisposed = true; + if (typeof(IDisposable).IsAssignableFrom(typeof(T))) + { + lock (itemStore) + { + while (itemStore.Count > 0) + { + IDisposable disposable = (IDisposable)itemStore.Fetch(); + disposable.Dispose(); + } + } + } + sync.Close(); + } - public ObjectPool(Factory factory) - : this(factory, Environment.ProcessorCount * 2) - { } + #region Acquisition - public ObjectPool(Factory factory, int size) + private T AcquireEager() { - Debug.Assert(size >= 1); - _factory = factory; - _items = new Element[size - 1]; + lock (itemStore) + { + return itemStore.Fetch(); + } } - private T CreateInstance() + private T AcquireLazy() { - var inst = _factory(); - return inst; + lock (itemStore) + { + if (itemStore.Count > 0) + { + return itemStore.Fetch(); + } + } + Interlocked.Increment(ref count); + return factory(this); } - /// - /// Produces an instance. - /// - /// - /// Search strategy is a simple linear probing which is chosen for it cache-friendliness. - /// Note that Free will try to store recycled objects close to the start thus statistically - /// reducing how far we will typically search. - /// - public T Allocate() + private T AcquireLazyExpanding() { - // PERF: Examine the first element. If that fails, AllocateSlow will look at the remaining elements. - // Note that the initial read is optimistically not synchronized. That is intentional. - // We will interlock only when we have a candidate. in a worst case we may miss some - // recently returned objects. Not a big deal. - T inst = _firstItem; - if (inst == null || inst != Interlocked.CompareExchange(ref _firstItem, null, inst)) + bool shouldExpand = false; + if (count < size) + { + int newCount = Interlocked.Increment(ref count); + if (newCount <= size) + { + shouldExpand = true; + } + else + { + // Another thread took the last spot - use the store instead + Interlocked.Decrement(ref count); + } + } + if (shouldExpand) + { + return factory(this); + } + else { - inst = AllocateSlow(); + lock (itemStore) + { + return itemStore.Fetch(); + } } - return inst; } - private T AllocateSlow() + private void PreloadItems() { - var items = _items; + for (int i = 0; i < size; i++) + { + T item = factory(this); + itemStore.Store(item); + } + count = size; + } + + #endregion + + #region Collection Wrappers - for (int i = 0; i < items.Length; i++) + interface IItemStore + { + T Fetch(); + void Store(T item); + int Count { get; } + } + + private IItemStore CreateItemStore(AccessMode mode, int capacity) + { + switch (mode) { - // Note that the initial read is optimistically not synchronized. That is intentional. - // We will interlock only when we have a candidate. in a worst case we may miss some - // recently returned objects. Not a big deal. - T inst = items[i].Value; - if (inst != null) - { - if (inst == Interlocked.CompareExchange(ref items[i].Value, null, inst)) - { - return inst; - } - } + case AccessMode.FIFO: + return new QueueStore(capacity); + case AccessMode.LIFO: + return new StackStore(capacity); + default: + Debug.Assert(mode == AccessMode.Circular, + "Invalid AccessMode in CreateItemStore"); + return new CircularStore(capacity); + } + } + + class QueueStore : Queue, IItemStore + { + public QueueStore(int capacity) : base(capacity) + { + } + + public T Fetch() + { + return Dequeue(); } - return CreateInstance(); + public void Store(T item) + { + Enqueue(item); + } } - /// - /// Returns objects to the pool. - /// - /// - /// Search strategy is a simple linear probing which is chosen for it cache-friendliness. - /// Note that Free will try to store recycled objects close to the start thus statistically - /// reducing how far we will typically search in Allocate. - /// - public void Free(T obj) + class StackStore : Stack, IItemStore { - if (!Validate(obj)) + public StackStore(int capacity) : base(capacity) { - return; } - if (_firstItem == null) + + public T Fetch() { - // Intentionally not using interlocked here. - // In a worst case scenario two objects may be stored into same slot. - // It is very unlikely to happen and will only mean that one of the objects will get collected. - _firstItem = obj; + return Pop(); } - else + + public void Store(T item) { - FreeSlow(obj); + Push(item); } } - private void FreeSlow(T obj) + class CircularStore : IItemStore { - var items = _items; - for (int i = 0; i < items.Length; i++) + private List slots; + private int freeSlotCount; + private int position = -1; + + public CircularStore(int capacity) { - if (items[i].Value == null) + slots = new List(capacity); + } + + public T Fetch() + { + if (Count == 0) + throw new InvalidOperationException("The buffer is empty."); + + int startPosition = position; + do + { + Advance(); + Slot slot = slots[position]; + if (!slot.IsInUse) + { + slot.IsInUse = true; + --freeSlotCount; + return slot.Item; + } + } while (startPosition != position); + throw new InvalidOperationException("No free slots."); + } + + public void Store(T item) + { + Slot slot = slots.Find(s => object.Equals(s.Item, item)); + if (slot == null) { - // Intentionally not using interlocked here. - // In a worst case scenario two objects may be stored into same slot. - // It is very unlikely to happen and will only mean that one of the objects will get collected. - items[i].Value = obj; - break; + slot = new Slot(item); + slots.Add(slot); } + slot.IsInUse = false; + ++freeSlotCount; } - } - private bool Validate(object obj) - { - if (obj == null) return false; - if (_firstItem == obj) return false; - var items = _items; - for (int i = 0; i < items.Length; i++) + public int Count { - var value = items[i].Value; - if (value == null) + get { return freeSlotCount; } + } + + private void Advance() + { + position = (position + 1) % slots.Count; + } + + class Slot + { + public Slot(T item) { - return true; + this.Item = item; } - if (value == obj) return false; + + public T Item { get; private set; } + public bool IsInUse { get; set; } } - return true; } - } - - /* - Alternate - https://stackoverflow.com/questions/1698738/objectpoolt-or-similar-for-net-already-in-a-library + #endregion - */ + public bool IsDisposed + { + get { return isDisposed; } + } + } } \ No newline at end of file diff --git a/temp2/Program.cs b/temp2/Program.cs index 1798fcc..25c7945 100644 --- a/temp2/Program.cs +++ b/temp2/Program.cs @@ -12,10 +12,10 @@ namespace temp2 { static void Main(string[] args) { - OptWorld world; + SmallWorld world; using (var ms = new FileStream(@"F:\graph_test.bin", FileMode.Open, FileAccess.Read, FileShare.None)) { - world = new OptWorld(NSWOptions.Create(6, 12, 100, 10, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple), ms); + world = new SmallWorld(NSWOptions.Create(6, 12, 100, 10, Metrics.L2Euclidean, true, true, selectionHeuristic: NeighbourSelectionHeuristic.SelectSimple), ms); } var test_vectors = new List(); @@ -34,7 +34,7 @@ namespace temp2 Console.WriteLine("Completed"); } - static void Forward(OptWorld world, List test_vectors) + static void Forward(SmallWorld world, List test_vectors) { int K = 10; foreach (var v in test_vectors)