using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;

namespace ZeroLevel.HNSW
{
    /// <summary>
    /// Draws a random layer number for a new graph element by comparing a random
    /// value against a table of geometrically decreasing thresholds.
    /// </summary>
    public class ProbabilityLayerNumberGenerator
    {
        // Each threshold is the previous one divided by this constant.
        private const float DIVIDER = 4.362f;

        // _probabilities[i] == 1 / DIVIDER^(i + 1), so the table is strictly decreasing.
        private readonly float[] _probabilities;

        /// <summary>
        /// Builds the threshold table.
        /// </summary>
        /// <param name="maxLayers">Number of thresholds to generate (one per layer).</param>
        /// <param name="M">Not used by this implementation; kept so existing callers compile.</param>
        public ProbabilityLayerNumberGenerator(int maxLayers, int M)
        {
            _probabilities = new float[maxLayers];
            var probability = 1.0f / DIVIDER;
            for (int i = 0; i < maxLayers; i++)
            {
                _probabilities[i] = probability;
                probability /= DIVIDER;
            }
        }

        /// <summary>
        /// Draws one random value and returns the index of the first threshold it exceeds.
        /// </summary>
        /// <returns>
        /// A layer index in <c>[0, maxLayers)</c>; 0 when the drawn value exceeds none of
        /// the thresholds (including when the table is empty).
        /// </returns>
        public int GetRandomLayer()
        {
            var probability = DefaultRandomGenerator.Instance.NextFloat();
            for (int i = 0; i < _probabilities.Length; i++)
            {
                if (probability > _probabilities[i])
                {
                    return i;
                }
            }
            return 0;
        }
    }

    /// <summary>
    /// Multi-layer small-world graph built with the insertion and search procedures of
    /// the HNSW paper referenced in the region below.
    /// </summary>
    /// <typeparam name="TItem">Type of the stored vectors.</typeparam>
    public class SmallWorld<TItem>
    {
        private readonly NSWOptions<TItem> _options;
        private readonly VectorSet<TItem> _vectors;
        private readonly Layer<TItem>[] _layers;

        private Layer<TItem> EnterPointsLayer => _layers[_layers.Length - 1];
        private Layer<TItem> LastLayer => _layers[0];

        // Both stay at -1 until the first element has been inserted / promoted.
        private int EntryPoint = -1;
        private int MaxLayer = -1;

        private readonly ProbabilityLayerNumberGenerator _layerLevelGenerator;
        private readonly ReaderWriterLockSlim _lockGraph = new ReaderWriterLockSlim();

        // EntryPoint is only assigned once an insert lands above the current top layer.
        // Until then node id 0 is used, which is the fallback INSERT has always applied;
        // sharing it here keeps KNearest from starting a search at id -1.
        private int CurrentEntryPoint => EntryPoint == -1 ? 0 : EntryPoint;

        /// <summary>
        /// Creates an empty graph with <c>options.LayersCount</c> layers that all share one vector set.
        /// </summary>
        /// <param name="options">Graph construction options.</param>
        public SmallWorld(NSWOptions<TItem> options)
        {
            _options = options;
            _vectors = new VectorSet<TItem>();
            _layers = new Layer<TItem>[_options.LayersCount];
            _layerLevelGenerator = new ProbabilityLayerNumberGenerator(_options.LayersCount, _options.M);
            for (int i = 0; i < _options.LayersCount; i++)
            {
                _layers[i] = new Layer<TItem>(_options, _vectors);
            }
        }

        /// <summary>
        /// Placeholder: the search is not implemented here and always yields an empty sequence.
        /// </summary>
        public IEnumerable<(int, TItem[])> Search(TItem vector, int k, HashSet<int> activeNodes = null)
        {
            return Enumerable.Empty<(int, TItem[])>();
        }

        /// <summary>
        /// Appends the vectors to the vector set and inserts each of them into the graph
        /// while holding the write lock.
        /// </summary>
        /// <param name="vectors">Vectors to add.</param>
        /// <returns>The ids returned by the vector set for the appended vectors.</returns>
        public int[] AddItems(IEnumerable<TItem> vectors)
        {
            _lockGraph.EnterWriteLock();
            try
            {
                var ids = _vectors.Append(vectors);
                for (int i = 0; i < ids.Length; i++)
                {
                    INSERT(ids[i]);
                }
                return ids;
            }
            finally
            {
                _lockGraph.ExitWriteLock();
            }
        }

        /// <summary>
        /// Diagnostic helper: draws 10000 layer numbers and prints, per layer, the share
        /// of draws as a percentage together with the raw count.
        /// </summary>
        public void TestLevelGenerator()
        {
            var levels = new Dictionary<int, float>();
            for (int i = 0; i < 10000; i++)
            {
                var level = _layerLevelGenerator.GetRandomLayer();
                // A missing key leaves count at 0, so the first hit stores 1.
                levels.TryGetValue(level, out var count);
                levels[level] = count + 1.0f;
            }
            foreach (var pair in levels.OrderBy(l => l.Key))
            {
                // 10000 samples, so count / 100 is the percentage.
                Console.WriteLine($"[{pair.Key}]: {pair.Value / 100.0f}% ({pair.Value})");
            }
        }

        /// <summary>
        /// Returns the entry with the smallest value in a single pass.
        /// Ties resolve to the first entry in enumeration order and values are compared with
        /// <see cref="float.CompareTo(float)"/>, matching <c>OrderBy(p =&gt; p.Value).First()</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">The sequence is empty.</exception>
        private static KeyValuePair<int, float> Nearest(IEnumerable<KeyValuePair<int, float>> candidates)
        {
            using (var cursor = candidates.GetEnumerator())
            {
                if (!cursor.MoveNext())
                {
                    throw new InvalidOperationException("Sequence contains no elements");
                }
                var best = cursor.Current;
                while (cursor.MoveNext())
                {
                    if (cursor.Current.Value.CompareTo(best.Value) < 0)
                    {
                        best = cursor.Current;
                    }
                }
                return best;
            }
        }

        #region https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf
        /// <summary>
        /// Algorithm 1: inserts the already stored vector with id <paramref name="q"/> into the graph.
        /// </summary>
        /// <remarks>
        /// Does not take the graph lock itself; <see cref="AddItems"/> calls it under the write lock.
        /// </remarks>
        /// <param name="q">Id of the vector inside the shared vector set.</param>
        public void INSERT(int q)
        {
            var distance = new Func<int, float>(candidate => _options.Distance(_vectors[q], _vectors[candidate]));
            // W ← ∅ // list for the currently found nearest elements
            IDictionary<int, float> W = new Dictionary<int, float>();
            // ep ← get enter point for hnsw
            var ep = CurrentEntryPoint;
            // L ← level of ep // top layer for hnsw
            var L = MaxLayer;
            // l ← ⌊-ln(unif(0..1))∙mL⌋ // new element's level
            int l = _layerLevelGenerator.GetRandomLayer();
            if (L == -1)
            {
                // First element: its level becomes the top layer of the graph.
                L = l;
                MaxLayer = l;
            }
            // for lc ← L … l+1
            // Descend from the top layer to the layer where the element first appears,
            // in order to find the entry point.
            for (int lc = L; lc > l; --lc)
            {
                // W ← SEARCH-LAYER(q, ep, ef = 1, lc)
                _layers[lc].RunKnnAtLayer(ep, distance, W, 1);
                // ep ← get the nearest element from W to q
                ep = Nearest(W).Key;
                W.Clear();
            }
            // for lc ← min(L, l) … 0
            // connecting new node to the small world
            for (int lc = Math.Min(L, l); lc >= 0; --lc)
            {
                // W ← SEARCH-LAYER(q, ep, efConstruction, lc)
                _layers[lc].RunKnnAtLayer(ep, distance, W, _options.EFConstruction);
                // neighbors ← SELECT-NEIGHBORS(q, W, M, lc) // alg. 3 or alg. 4
                var neighbors = SelectBestForConnecting(lc, distance, W);
                // add bidirectional connections from neighbors to q at layer lc
                foreach (var e in neighbors)
                {
                    _layers[lc].AddBidirectionallConnectionts(q, e.Key, e.Value);
                }
                // ep ← W : the next (lower) layer is entered at the closest element found here
                ep = Nearest(W).Key;
                W.Clear();
            }
            // if l > L
            if (l > L)
            {
                // set enter point for hnsw to q: the new element is the one that was
                // assigned to the new top layer, so later descents must start from it.
                MaxLayer = l;
                EntryPoint = q;
            }
        }

        /// <summary>
        /// Get maximum allowed connections for the given level.
        /// </summary>
        /// <remarks>
        /// Article: Section 4.1:
        /// "Selection of the Mmax0 (the maximum number of connections that an element can have in the zero layer) also
        /// has a strong influence on the search performance, especially in case of high quality(high recall) search.
        /// Simulations show that setting Mmax0 to M(this corresponds to kNN graphs on each layer if the neighbors
        /// selection heuristic is not used) leads to a very strong performance penalty at high recall.
        /// Simulations also suggest that 2∙M is a good choice for Mmax0;
        /// setting the parameter higher leads to performance degradation and excessive memory usage."
        /// </remarks>
        /// <param name="layer">The level of the layer.</param>
        /// <returns>The maximum number of connections.</returns>
        internal int GetM(int layer) => layer == 0 ? 2 * _options.M : _options.M;

        /// <summary>
        /// Delegates neighbour selection for <paramref name="layer"/> to that layer's simple
        /// selection routine, capped at <see cref="GetM"/> connections.
        /// </summary>
        private IDictionary<int, float> SelectBestForConnecting(int layer, Func<int, float> distance, IDictionary<int, float> candidates)
            => _layers[layer].SELECT_NEIGHBORS_SIMPLE(distance, candidates, GetM(layer));

        /// <summary>
        /// Algorithm 5: searches the graph for elements close to <paramref name="q"/> under the read lock.
        /// </summary>
        /// <param name="q">Query vector.</param>
        /// <param name="k">Passed to the layer-0 search as its <c>ef</c> value.</param>
        /// <returns>
        /// (id, distance) pairs collected by the layer-0 search, in dictionary enumeration order;
        /// empty when no vectors are stored.
        /// </returns>
        internal IEnumerable<(int, float)> KNearest(TItem q, int k)
        {
            _lockGraph.EnterReadLock();
            try
            {
                if (_vectors.Count == 0)
                {
                    return Enumerable.Empty<(int, float)>();
                }
                var distance = new Func<int, float>(candidate => _options.Distance(q, _vectors[candidate]));
                // W ← ∅ // set for the current nearest elements
                var W = new Dictionary<int, float>(k + 1);
                // ep ← get enter point for hnsw
                // (falls back to node 0 while EntryPoint is still unset, same as INSERT)
                var ep = CurrentEntryPoint;
                // L ← level of ep // top layer for hnsw
                var L = MaxLayer;
                // for lc ← L … 1
                for (int layer = L; layer > 0; --layer)
                {
                    // W ← SEARCH-LAYER(q, ep, ef = 1, lc)
                    _layers[layer].RunKnnAtLayer(ep, distance, W, 1);
                    // ep ← get nearest element from W to q
                    ep = Nearest(W).Key;
                    W.Clear();
                }
                // W ← SEARCH-LAYER(q, ep, ef, lc = 0)
                _layers[0].RunKnnAtLayer(ep, distance, W, k);
                // return K nearest elements from W to q
                // (W is local and no longer modified, so the deferred projection is safe
                // to enumerate after the lock is released)
                return W.Select(p => (p.Key, p.Value));
            }
            finally
            {
                _lockGraph.ExitReadLock();
            }
        }
        #endregion
    }
}