You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Zero/ZeroLevel.HNSW/SmallWorld.cs

243 lines
9.5 KiB

3 years ago
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
3 years ago
namespace ZeroLevel.HNSW
{
public class ProbabilityLayerNumberGenerator
{
3 years ago
private const float DIVIDER = 3.361f;
private readonly float[] _probabilities;
public ProbabilityLayerNumberGenerator(int maxLayers, int M)
{
_probabilities = new float[maxLayers];
var probability = 1.0f / DIVIDER;
for (int i = 0; i < maxLayers; i++)
{
_probabilities[i] = probability;
probability /= DIVIDER;
}
}
public int GetRandomLayer()
{
var probability = DefaultRandomGenerator.Instance.NextFloat();
for (int i = 0; i < _probabilities.Length; i++)
{
if (probability > _probabilities[i])
return i;
}
return 0;
}
}
3 years ago
public class SmallWorld<TItem>
{
private readonly NSWOptions<TItem> _options;
private readonly VectorSet<TItem> _vectors;
private readonly Layer<TItem>[] _layers;
3 years ago
private int EntryPoint = 0;
private int MaxLayer = 0;
private readonly ProbabilityLayerNumberGenerator _layerLevelGenerator;
private ReaderWriterLockSlim _lockGraph = new ReaderWriterLockSlim();
3 years ago
public SmallWorld(NSWOptions<TItem> options)
{
_options = options;
_vectors = new VectorSet<TItem>();
_layers = new Layer<TItem>[_options.LayersCount];
_layerLevelGenerator = new ProbabilityLayerNumberGenerator(_options.LayersCount, _options.M);
3 years ago
for (int i = 0; i < _options.LayersCount; i++)
{
_layers[i] = new Layer<TItem>(_options, _vectors);
}
}
/// <summary>
/// Search in the graph K for vectors closest to a given vector
/// </summary>
/// <param name="vector">Given vector</param>
/// <param name="k">Count of elements for search</param>
/// <param name="activeNodes"></param>
/// <returns></returns>
3 years ago
public IEnumerable<(int, TItem, float)> Search(TItem vector, int k, HashSet<int> activeNodes = null)
3 years ago
{
foreach (var pair in KNearest(vector, k, activeNodes))
3 years ago
{
yield return (pair.Item1, _vectors[pair.Item1], pair.Item2);
}
3 years ago
}
/// <summary>
/// Adding vectors batch
/// </summary>
/// <param name="vectors">Vectors</param>
/// <returns>Vector identifiers in a graph</returns>
3 years ago
public int[] AddItems(IEnumerable<TItem> vectors)
{
_lockGraph.EnterWriteLock();
try
3 years ago
{
var ids = _vectors.Append(vectors);
for (int i = 0; i < ids.Length; i++)
{
INSERT(ids[i]);
}
return ids;
}
finally
{
_lockGraph.ExitWriteLock();
3 years ago
}
}
#region https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf
/// <summary>
/// Algorithm 1
/// </summary>
private void INSERT(int q)
3 years ago
{
var distance = new Func<int, float>(candidate => _options.Distance(_vectors[q], _vectors[candidate]));
3 years ago
// W ← ∅ // list for the currently found nearest elements
IDictionary<int, float> W = new Dictionary<int, float>();
3 years ago
// ep ← get enter point for hnsw
3 years ago
var ep = EntryPoint;
var epDist = distance(ep);
3 years ago
// L ← level of ep // top layer for hnsw
var L = MaxLayer;
3 years ago
// l ← ⌊-ln(unif(0..1))∙mL⌋ // new elements level
3 years ago
int l = _layerLevelGenerator.GetRandomLayer();
3 years ago
// for lc ← L … l+1
// Проход с верхнего уровня до уровня где появляется элемент, для нахождения точки входа
for (int lc = L; lc > l; --lc)
3 years ago
{
if (_layers[lc].HasLinks == false)
3 years ago
{
_layers[lc].Append(q);
ep = q;
}
else
{
// W ← SEARCH-LAYER(q, ep, ef = 1, lc)
_layers[lc].KNearestAtLayer(ep, distance, W, 1);
3 years ago
// ep ← get the nearest element from W to q
var nearest = W.OrderBy(p => p.Value).First();
ep = nearest.Key;
epDist = nearest.Value;
W.Clear();
}
3 years ago
}
//for lc ← min(L, l) … 0
// connecting new node to the small world
for (int lc = Math.Min(L, l); lc >= 0; --lc)
3 years ago
{
if (_layers[lc].HasLinks == false)
3 years ago
{
_layers[lc].Append(q);
ep = q;
}
else
3 years ago
{
3 years ago
// W ← SEARCH - LAYER(q, ep, efConstruction, lc)
_layers[lc].KNearestAtLayer(ep, distance, W, _options.EFConstruction);
3 years ago
// neighbors ← SELECT-NEIGHBORS(q, W, M, lc) // alg. 3 or alg. 4
var neighbors = SelectBestForConnecting(lc, distance, W);
// add bidirectionall connectionts from neighbors to q at layer lc
// for each e ∈ neighbors // shrink connections if needed
foreach (var e in neighbors)
{
3 years ago
// eConn ← neighbourhood(e) at layer lc
_layers[lc].AddBidirectionallConnections(q, e.Key, e.Value, lc == 0);
3 years ago
// if distance from newNode to newNeighbour is better than to bestPeer => update bestPeer
if (e.Value < epDist)
{
ep = e.Key;
epDist = e.Value;
}
}
3 years ago
// ep ← W
ep = W.OrderBy(p => p.Value).First().Key;
W.Clear();
3 years ago
}
}
// if l > L
if (l > L)
{
// set enter point for hnsw to q
L = l;
MaxLayer = l;
EntryPoint = ep;
3 years ago
}
}
/// <summary>
/// Get maximum allowed connections for the given level.
/// </summary>
/// <remarks>
/// Article: Section 4.1:
/// "Selection of the Mmax0 (the maximum number of connections that an element can have in the zero layer) also
/// has a strong influence on the search performance, especially in case of high quality(high recall) search.
/// Simulations show that setting Mmax0 to M(this corresponds to kNN graphs on each layer if the neighbors
/// selection heuristic is not used) leads to a very strong performance penalty at high recall.
/// Simulations also suggest that 2∙M is a good choice for Mmax0;
/// setting the parameter higher leads to performance degradation and excessive memory usage."
/// </remarks>
/// <param name="layer">The level of the layer.</param>
/// <returns>The maximum number of connections.</returns>
private int GetM(int layer)
{
return layer == 0 ? 2 * _options.M : _options.M;
}
private IDictionary<int, float> SelectBestForConnecting(int layer, Func<int, float> distance, IDictionary<int, float> candidates)
{
3 years ago
if (_options.SelectionHeuristic == NeighbourSelectionHeuristic.SelectSimple)
return _layers[layer].SELECT_NEIGHBORS_SIMPLE(distance, candidates, GetM(layer));
return _layers[layer].SELECT_NEIGHBORS_HEURISTIC(distance, candidates, GetM(layer));
3 years ago
}
/// <summary>
/// Algorithm 5
/// </summary>
private IEnumerable<(int, float)> KNearest(TItem q, int k, HashSet<int> activeNodes = null)
3 years ago
{
_lockGraph.EnterReadLock();
try
3 years ago
{
if (_vectors.Count == 0)
{
return Enumerable.Empty<(int, float)>();
}
var distance = new Func<int, float>(candidate => _options.Distance(q, _vectors[candidate]));
// W ← ∅ // set for the current nearest elements
var W = new Dictionary<int, float>(k + 1);
// ep ← get enter point for hnsw
var ep = EntryPoint;
// L ← level of ep // top layer for hnsw
var L = MaxLayer;
// for lc ← L … 1
for (int layer = L; layer > 0; --layer)
{
// W ← SEARCH-LAYER(q, ep, ef = 1, lc)
_layers[layer].KNearestAtLayer(ep, distance, W, 1);
// ep ← get nearest element from W to q
ep = W.OrderBy(p => p.Value).First().Key;
W.Clear();
}
// W ← SEARCH-LAYER(q, ep, ef, lc =0)
_layers[0].KNearestAtLayer(ep, distance, W, k, activeNodes);
// return K nearest elements from W to q
return W.Select(p => (p.Key, p.Value));
}
finally
{
_lockGraph.ExitReadLock();
3 years ago
}
}
#endregion
}
}

Powered by TurnKey Linux.