Append BloomFilter and HyperBloomBloom
Append hash functions: XXH3
pull/1/head
Ogoun 5 years ago
parent f25acac14f
commit 73753c00b9

@ -1,5 +1,4 @@
using Newtonsoft.Json; using Newtonsoft.Json;
using System;
using ZeroLevel; using ZeroLevel;
using ZeroLevel.Logging; using ZeroLevel.Logging;

@ -0,0 +1,111 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Xunit;
using ZeroLevel.DataStructures;
namespace ZeroLevel.UnitTests
{
/// <summary>
/// Smoke tests for <see cref="BloomFilter"/> and <see cref="HyperBloomBloom"/>.
/// Every added item must be reported as present; hits for unrelated random
/// strings are only counted and written to the debug output.
/// </summary>
public class BloomFilterTest
{
    // Number of random strings generated for each of the two sample sets.
    private const int SampleSize = 100000;

    // Size, in bits, passed to the filters under test.
    private const int FilterBitSize = 16536 * 1024;

    private static readonly Random random = new Random();

    /// <summary>
    /// Builds a random string of the requested length from latin letters,
    /// digits and the space character.
    /// </summary>
    /// <param name="length">Number of characters to generate.</param>
    public static string RandomString(int length)
    {
        const string chars = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        return new string(Enumerable.Repeat(chars, length)
            .Select(s => s[random.Next(s.Length)]).ToArray());
    }

    [Fact]
    public void SimpleBloomFilterTest()
    {
        var bloom = new BloomFilter(FilterBitSize, true);
        CheckFilter("BloomFilter", bloom.Add, bloom.Contains);
    }

    [Fact]
    public void HyperBloomBloomFilterTest()
    {
        var bloom = new HyperBloomBloom(FilterBitSize, true);
        CheckFilter("HyperBloomBloom", bloom.Add, bloom.Contains);
    }

    /// <summary>
    /// Shared scenario: fills a filter with random strings, checks that none of
    /// them is reported as missing and logs how many strings from a second,
    /// independent random set are reported as present although never added.
    /// </summary>
    /// <param name="filterName">Label used in the debug output.</param>
    /// <param name="add">Adds an item to the filter under test.</param>
    /// <param name="contains">Queries the filter under test.</param>
    private static void CheckFilter(string filterName, Action<string> add, Func<string, bool> contains)
    {
        // Arrange
        var lines = new HashSet<string>(SampleSize);
        var lines_another = new HashSet<string>(SampleSize);
        for (int i = 0; i < SampleSize; i++)
        {
            lines.Add(RandomString(i % 9 + 5));
            lines_another.Add(RandomString(i % 9 + 5));
        }

        // Act
        var sw = new Stopwatch();
        sw.Start();
        foreach (var line in lines)
        {
            add(line);
        }
        sw.Stop();
        Debug.Print($"{filterName}. Append {lines.Count} items. {sw.ElapsedMilliseconds} ms");

        // Assert
        foreach (var line in lines)
        {
            Assert.True(contains(line));
        }

        // A hit for a string that was never added is a false positive (collision).
        int collision_count = 0;
        foreach (var line in lines_another)
        {
            if (contains(line) && false == lines.Contains(line))
            {
                collision_count++;
            }
        }
        Debug.WriteLine($"{filterName}. Collision for string: {collision_count}.");
    }
}
}

@ -0,0 +1,176 @@
using System;
using System.Collections;
using System.Linq;
using System.Runtime.CompilerServices;
using ZeroLevel.Services.HashFunctions;
namespace ZeroLevel.DataStructures
{
/// <summary>
/// Bloom filter implementation, 128 bit.
/// Every item is hashed by three functions (XXH3_64, XXHashUnsafe and Murmur3Unsafe);
/// each hash addresses its own bit array. When reverse mode is enabled the reversed
/// string is hashed the same way into three additional bit arrays.
/// </summary>
public class BloomFilter
{
    #region Private
    /// <summary>
    /// Hash values computed for one item: direct string and (optionally) reversed string.
    /// </summary>
    private struct HIND
    {
        public ulong PrimaryDirect;
        public uint SecondDirect;
        public uint ThirdDirect;
        public ulong PrimaryReverse;
        public uint SecondReverse;
        public uint ThirdReverse;
    }

    private readonly BitArray _primary;
    private readonly BitArray _second;
    private readonly BitArray _third;
    // The reverse arrays stay null when reverse mode is off.
    private readonly BitArray _r_primary;
    private readonly BitArray _r_second;
    private readonly BitArray _r_third;
    private readonly bool _use_reverse = false;
    private readonly XXHashUnsafe _hash_xx_32 = new XXHashUnsafe();
    private readonly Murmur3Unsafe _hash_mm_32 = new Murmur3Unsafe();
    #endregion

    /// <summary>
    /// Creates a filter whose bit arrays each hold <paramref name="bit_size"/> bits.
    /// </summary>
    /// <param name="bit_size">Number of bits in every bit array; must be positive.</param>
    /// <param name="use_reverse">When true, hashes of the reversed string are tracked as well.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bit_size"/> is zero or negative.</exception>
    public BloomFilter(int bit_size, bool use_reverse)
    {
        // A zero-length array would only fail later, with a DivideByZeroException
        // in the modulo of Add/Contains, so reject it up front.
        if (bit_size <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bit_size), "The filter size must be a positive number of bits.");
        }
        _use_reverse = use_reverse;
        _primary = new BitArray(bit_size);
        _second = new BitArray(bit_size);
        _third = new BitArray(bit_size);
        if (_use_reverse)
        {
            _r_primary = new BitArray(bit_size);
            _r_second = new BitArray(bit_size);
            _r_third = new BitArray(bit_size);
        }
    }

    /// <summary>
    /// Adds an item to the filter. Null and empty strings are ignored.
    /// </summary>
    public void Add(string item)
    {
        if (item == null || item.Length == 0) return;
        Add(Compute(item));
    }

    /// <summary>
    /// Returns false when the item was definitely not added; true when it may have been added.
    /// Null and empty strings are always reported as present.
    /// </summary>
    public bool Contains(string item)
    {
        if (item == null || item.Length == 0) return true;
        return Contains(Compute(item));
    }

    /// <summary>
    /// true if added, false if already exists (null and empty strings are reported as existing).
    /// </summary>
    public bool TryAdd(string item)
    {
        if (item == null || item.Length == 0) return false;
        var hind = Compute(item);
        if (Contains(hind))
        {
            return false;
        }
        Add(hind);
        return true;
    }

    /// <summary>
    /// Returns a new string with the UTF-16 code units of <paramref name="s"/> in reverse order.
    /// </summary>
    public static string Reverse(string s)
    {
        char[] charArray = s.ToCharArray();
        Array.Reverse(charArray);
        return new string(charArray);
    }

    /// <summary>
    /// Computes all hashes for the line; reverse hashes stay zero when reverse mode is off.
    /// </summary>
    private HIND Compute(string line)
    {
        var hind = new HIND
        {
            PrimaryDirect = HashUL(line),
            SecondDirect = HashXX(line),
            ThirdDirect = HashMM(line)
        };
        if (_use_reverse)
        {
            var r = Reverse(line);
            hind.PrimaryReverse = HashUL(r);
            hind.SecondReverse = HashXX(r);
            hind.ThirdReverse = HashMM(r);
        }
        return hind;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void Add(HIND hind)
    {
        _primary[(int)(hind.PrimaryDirect % (ulong)_primary.Length)] = true;
        _second[(int)(hind.SecondDirect % (uint)_second.Length)] = true;
        _third[(int)(hind.ThirdDirect % (uint)_third.Length)] = true;
        if (_use_reverse)
        {
            // Each reverse array is indexed modulo its own length.
            _r_primary[(int)(hind.PrimaryReverse % (ulong)_r_primary.Length)] = true;
            _r_second[(int)(hind.SecondReverse % (uint)_r_second.Length)] = true;
            _r_third[(int)(hind.ThirdReverse % (uint)_r_third.Length)] = true;
        }
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private bool Contains(HIND hind)
    {
        // A single unset bit proves the item was never added.
        if (!_primary[(int)(hind.PrimaryDirect % (ulong)_primary.Length)]) return false;
        if (!_second[(int)(hind.SecondDirect % (uint)_second.Length)]) return false;
        if (!_third[(int)(hind.ThirdDirect % (uint)_third.Length)]) return false;
        if (_use_reverse)
        {
            if (!_r_primary[(int)(hind.PrimaryReverse % (ulong)_r_primary.Length)]) return false;
            if (!_r_second[(int)(hind.SecondReverse % (uint)_r_second.Length)]) return false;
            if (!_r_third[(int)(hind.ThirdReverse % (uint)_r_third.Length)]) return false;
        }
        return true;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private ulong HashUL(string line)
    {
        return XXH3_64.Hash(line);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private uint HashXX(string line)
    {
        return _hash_xx_32.Hash(line);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private uint HashMM(string line)
    {
        return _hash_mm_32.Hash(line);
    }
}
}

@ -0,0 +1,49 @@
using System;
using System.Collections.Generic;
namespace ZeroLevel.DataStructures
{
/// <summary>
/// A set of <see cref="BloomFilter"/> instances sharded by the lower-cased first
/// character of the item. Items whose first character has no dedicated shard go
/// to a shared fallback filter.
/// </summary>
public class HyperBloomBloom
{
    // Characters that get a dedicated shard.
    private const string ShardKeys = "abcdefghijklmnopqrstuvwxyz0123456789-";

    // Fallback filter for items whose first character is not in ShardKeys.
    private readonly BloomFilter _trash;
    private readonly Dictionary<char, BloomFilter> _shardes = new Dictionary<char, BloomFilter>();

    /// <summary>
    /// Creates the fallback filter and one filter per shard key, all with the same settings.
    /// </summary>
    /// <param name="bit_size">Bit size passed to every underlying filter.</param>
    /// <param name="use_reverse">Reverse mode flag passed to every underlying filter.</param>
    public HyperBloomBloom(int bit_size, bool use_reverse)
    {
        _trash = new BloomFilter(bit_size, use_reverse);
        foreach (var ch in ShardKeys)
        {
            _shardes.Add(ch, new BloomFilter(bit_size, use_reverse));
        }
    }

    /// <summary>
    /// Adds an item to the shard selected by its first character. Null and empty strings are ignored.
    /// </summary>
    public void Add(string item)
    {
        if (item == null || item.Length == 0) return;
        SelectFilter(item).Add(item);
    }

    /// <summary>
    /// Queries the shard selected by the item's first character.
    /// Null and empty strings are always reported as present.
    /// </summary>
    public bool Contains(string item)
    {
        if (item == null || item.Length == 0) return true;
        return SelectFilter(item).Contains(item);
    }

    /// <summary>
    /// true if added, false if already exists
    /// </summary>
    public bool TryAdd(string item)
    {
        if (item == null || item.Length == 0) return false;
        return SelectFilter(item).TryAdd(item);
    }

    /// <summary>
    /// Picks the shard for a non-empty item, falling back to the shared filter.
    /// </summary>
    private BloomFilter SelectFilter(string item)
    {
        var k = Char.ToLowerInvariant(item[0]);
        BloomFilter filter;
        if (_shardes.TryGetValue(k, out filter) == false) filter = _trash;
        return filter;
    }
}
}

@ -3,7 +3,7 @@ using System.Collections.Generic;
using ZeroLevel.Models; using ZeroLevel.Models;
using ZeroLevel.Services.Serialization; using ZeroLevel.Services.Serialization;
namespace ZeroLevel.Services.Semantic.Helpers namespace ZeroLevel.DataStructures
{ {
public sealed class SparceVector public sealed class SparceVector
: IBinarySerializable : IBinarySerializable

@ -0,0 +1,257 @@
using System.Collections.Generic;
using System.Linq;
namespace ZeroLevel.DataStructures
{
/// <summary>
/// Represents a Sparse matrix. Items are stored twice, indexed by row and by column,
/// so that both row-wise and column-wise enumeration are available.
/// </summary>
/// <typeparam name="T">The type of the stored value.</typeparam>
public class SparseMatrix<T>
{
    /// <summary>
    /// Dictionary containing the row index as a key and as a value another dictionary
    /// containing the column index as a key and the stored value as a value.
    /// </summary>
    internal Dictionary<int, Dictionary<int, T>> rows;

    /// <summary>
    /// Dictionary containing the column index as a key and as a value another dictionary
    /// containing the row index as a key and the stored value as a value.
    /// </summary>
    internal Dictionary<int, Dictionary<int, T>> cols;

    /// <summary>
    /// Gets the maximum reached height of the sparse matrix.
    /// </summary>
    public int Height { get; internal set; }

    /// <summary>
    /// Gets the maximum reached width of the sparse matrix.
    /// </summary>
    public int Width { get; internal set; }

    /// <summary>
    /// Gets the number of items in the <see cref="SparseMatrix{T}"/>.
    /// </summary>
    public int Count { get; internal set; }

    /// <summary>
    /// Gets or sets an item in the sparse matrix. If there is no item on the given position
    /// on get the default value of T is returned and on set the item is added to the matrix.
    /// Setting an already occupied position replaces the item without changing <see cref="Count"/>.
    /// </summary>
    /// <param name="row">The zero-based row index of the item.</param>
    /// <param name="col">The zero-based column index of the item.</param>
    /// <returns>Returns the item in the sparse matrix. If there is no item on the given position the default value of T is returned instead.</returns>
    public T this[int row, int col]
    {
        get
        {
            Dictionary<int, T> rowItems;
            T item;
            if (rows.TryGetValue(row, out rowItems) && rowItems.TryGetValue(col, out item))
            {
                return item;
            }
            // If there is no item on the given position return the default value
            return default(T);
        }
        set
        {
            if (row >= Height) Height = row + 1;
            if (col >= Width) Width = col + 1;

            // If no items on the current row we have to create a new dictionary
            Dictionary<int, T> rowItems;
            if (!rows.TryGetValue(row, out rowItems))
            {
                rowItems = new Dictionary<int, T>();
                rows.Add(row, rowItems);
            }

            // If no items on the current col we have to create a new dictionary
            Dictionary<int, T> colItems;
            if (!cols.TryGetValue(col, out colItems))
            {
                colItems = new Dictionary<int, T>();
                cols.Add(col, colItems);
            }

            // Only a previously empty cell adds to the item count;
            // overwriting an existing item must leave Count untouched.
            if (!rowItems.ContainsKey(col))
            {
                Count++;
            }

            rowItems[col] = value;
            colItems[row] = value;
        }
    }

    /// <summary>
    /// Creates a new instance of the <see cref="SparseMatrix{T}"/> class.
    /// </summary>
    public SparseMatrix()
    {
        rows = new Dictionary<int, Dictionary<int, T>>();
        cols = new Dictionary<int, Dictionary<int, T>>();
    }

    /// <summary>
    /// Creates a new instance of the <see cref="SparseMatrix{T}"/> class from the given two dimensional array.
    /// </summary>
    /// <param name="array">The two dimensional array of items to add.</param>
    /// <param name="zeroItem">The item considered a zero item. All items from the array equal to the zero item won't be added to the matrix.</param>
    public SparseMatrix(T[,] array, T zeroItem)
    {
        rows = new Dictionary<int, Dictionary<int, T>>();
        cols = new Dictionary<int, Dictionary<int, T>>();

        int rowCount = array.GetLength(0);
        int colCount = array.GetLength(1);
        for (int row = 0; row < rowCount; row++)
        {
            for (int col = 0; col < colCount; col++)
            {
                if (!object.Equals(array[row, col], zeroItem))
                    this[row, col] = array[row, col];
            }
        }
    }

    /// <summary>
    /// Determines if the given position holds no item.
    /// </summary>
    /// <param name="row">The zero-based row index of the item.</param>
    /// <param name="col">The zero-based column index of the item.</param>
    /// <returns>Returns true if there is no item on the given position; otherwise false.</returns>
    public bool IsCellEmpty(int row, int col)
    {
        Dictionary<int, T> rowItems;
        return !(rows.TryGetValue(row, out rowItems) && rowItems.ContainsKey(col));
    }

    /// <summary>
    /// Gets the items in the given row sorted by the column index as an <see cref="IEnumerable{T}"/>
    /// of <see cref="KeyValuePair{TKey, TValue}"/> with the key being the column index and the value being the item.
    /// </summary>
    /// <param name="row">The zero-based row index.</param>
    /// <returns>Returns an <see cref="IEnumerable{T}"/> of <see cref="KeyValuePair{TKey, TValue}"/>
    /// with the key being the column index and the value being the item.</returns>
    public IEnumerable<KeyValuePair<int, T>> GetRowItems(int row)
    {
        Dictionary<int, T> rowItems;
        if (rows.TryGetValue(row, out rowItems))
        {
            // The sorted copy is taken when enumeration starts, not when the method is called.
            var sortedDict = new SortedDictionary<int, T>(rowItems);
            foreach (var item in sortedDict)
            {
                yield return item;
            }
        }
    }

    /// <summary>
    /// Gets the items in the given column sorted by the row index as an <see cref="IEnumerable{T}"/>
    /// of <see cref="KeyValuePair{TKey, TValue}"/> with the key being the row index and the value being the item.
    /// </summary>
    /// <param name="col">The zero-based column index.</param>
    /// <returns>Returns an <see cref="IEnumerable{T}"/> of <see cref="KeyValuePair{TKey, TValue}"/>
    /// with the key being the row index and the value being the item.</returns>
    public IEnumerable<KeyValuePair<int, T>> GetColumnItems(int col)
    {
        Dictionary<int, T> colItems;
        if (cols.TryGetValue(col, out colItems))
        {
            var sortedDict = new SortedDictionary<int, T>(colItems);
            foreach (var item in sortedDict)
            {
                yield return item;
            }
        }
    }

    /// <summary>
    /// Gets non empty rows indexes sorted in ascending order.
    /// </summary>
    /// <returns>Returns an <see cref="IEnumerable{T}"/> of integers being row indexes sorted in ascending order.</returns>
    public IEnumerable<int> GetNonEmptyRows()
    {
        var sortedRows = new SortedSet<int>(rows.Keys);
        foreach (var row in sortedRows)
        {
            yield return row;
        }
    }

    /// <summary>
    /// Gets non empty columns indexes sorted in ascending order.
    /// </summary>
    /// <returns>Returns an <see cref="IEnumerable{T}"/> of integers being column indexes sorted in ascending order.</returns>
    public IEnumerable<int> GetNonEmptyColumns()
    {
        var sortedCols = new SortedSet<int>(cols.Keys);
        foreach (var col in sortedCols)
        {
            yield return col;
        }
    }

    /// <summary>
    /// Removes the item on the given position. <see cref="Height"/> and <see cref="Width"/>
    /// are not recalculated here; call <see cref="UpdateDimensions"/> for that.
    /// </summary>
    /// <param name="row">The zero-based row index.</param>
    /// <param name="col">The zero-based column index.</param>
    /// <returns>Returns true if item is removed successfully; otherwise false. Also returns false if the item is not found.</returns>
    public bool Remove(int row, int col)
    {
        Dictionary<int, T> rowItems;
        if (!rows.TryGetValue(row, out rowItems) || !rowItems.ContainsKey(col))
        {
            return false;
        }

        Dictionary<int, T> colItems = cols[col];
        bool removedSuccessfully = rowItems.Remove(col) && colItems.Remove(row);

        // Drop the per-row / per-column dictionaries once they become empty
        // so that GetNonEmptyRows/GetNonEmptyColumns stay accurate.
        if (rowItems.Count == 0)
        {
            rows.Remove(row);
        }
        if (colItems.Count == 0)
        {
            cols.Remove(col);
        }

        if (removedSuccessfully)
            Count--;
        return removedSuccessfully;
    }

    /// <summary>
    /// Removes all elements from the sparse matrix.
    /// </summary>
    public void Clear()
    {
        rows.Clear();
        cols.Clear();
        Count = 0;
        Height = 0;
        Width = 0;
    }

    /// <summary>
    /// Updates the height and the width of the matrix. If no items were removed from the matrix the dimensions will be correct.
    /// </summary>
    public void UpdateDimensions()
    {
        if (rows.Count == 0)
        {
            Height = 0;
            Width = 0;
            return;
        }
        Height = rows.Keys.Max() + 1;
        Width = cols.Keys.Max() + 1;
    }
}
}

@ -0,0 +1,636 @@
using System;
using System.Collections;
using System.Collections.Concurrent;
using System.Diagnostics.Contracts;
using System.Threading;
using System.Threading.Tasks;
namespace ZeroLevel.Collections
{
// A vector of bits. Use this to store bits efficiently, without having to do bit
// shifting yourself. Bits are packed 32 per int. The bulk operations (SetAll, And,
// Or, Xor, Not, Fold and the byte[]/bool[] constructors) split their work into
// ranges that are processed through Parallel.ForEach.
[System.Runtime.InteropServices.ComVisible(true)]
[Serializable()]
public class FastBitArray : ICollection, ICloneable
{
    private FastBitArray()
    {
    }

    /*=========================================================================
    ** Allocates space to hold length bit values. All of the values in the bit
    ** array are set to false.
    ** Exceptions: ArgumentOutOfRangeException if length < 0.
    =========================================================================*/
    public FastBitArray(int length)
        : this(length, false)
    {
    }

    /*=========================================================================
    ** Allocates space to hold length bit values. All of the values in the bit
    ** array are set to defaultValue.
    **
    ** Exceptions: ArgumentOutOfRangeException if length < 0.
    =========================================================================*/
    public FastBitArray(int length, bool defaultValue)
    {
        if (length < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length), "The length should be at least zero.");
        }
        Contract.EndContractBlock();
        m_array = new int[GetArrayLength(length, BitsPerInt32)];
        m_length = length;
        int fillValue = defaultValue ? unchecked(((int)0xffffffff)) : 0;
        for (int i = 0; i < m_array.Length; i++)
        {
            m_array[i] = fillValue;
        }
        _version = 0;
    }

    /*=========================================================================
    ** Allocates space to hold the bit values in bytes. bytes[0] represents
    ** bits 0 - 7, bytes[1] represents bits 8 - 15, etc. The LSB of each byte
    ** represents the lowest index value; bytes[0] & 1 represents bit 0,
    ** bytes[0] & 2 represents bit 1, bytes[0] & 4 represents bit 2, etc.
    ** Exceptions: ArgumentNullException if bytes == null.
    =========================================================================*/
    public FastBitArray(byte[] bytes)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }
        Contract.EndContractBlock();
        // this value is chosen to prevent overflow when computing m_length.
        // m_length is of type int32 and is exposed as a property, so
        // type of m_length can't be changed to accommodate.
        if (bytes.Length > Int32.MaxValue / BitsPerByte)
        {
            throw new ArgumentException($"The array is too large {BitsPerByte}", nameof(bytes));
        }
        m_array = new int[GetArrayLength(bytes.Length, BytesPerInt32)];
        m_length = bytes.Length * BitsPerByte;

        // Complete groups of four bytes are packed in parallel; every range
        // writes only its own ints, so no synchronization is required.
        Parallel
            .ForEach(
                Partitioner.Create(0, m_array.Length),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        var idx = i * BytesPerInt32;
                        if (idx + 3 < bytes.Length)
                        {
                            m_array[i] = (bytes[idx] & 0xff) |
                                         ((bytes[idx + 1] & 0xff) << 8) |
                                         ((bytes[idx + 2] & 0xff) << 16) |
                                         ((bytes[idx + 3] & 0xff) << 24);
                        }
                    }
                });

        // The trailing 1-3 bytes (if any) go into the last int.
        var j = Math.Max(0, bytes.Length - (bytes.Length % BytesPerInt32));
        var last = Math.Max(0, m_array.Length - 1);
        switch (bytes.Length - j)
        {
            case 3:
                m_array[last] = ((bytes[j + 2] & 0xff) << 16);
                goto case 2;
            // fall through
            case 2:
                m_array[last] |= ((bytes[j + 1] & 0xff) << 8);
                goto case 1;
            // fall through
            case 1:
                m_array[last] |= (bytes[j] & 0xff);
                break;
        }
        _version = 0;
    }

    /*=========================================================================
    ** Allocates space to hold the bit values in values; values[i] becomes bit i.
    ** Exceptions: ArgumentNullException if values == null.
    =========================================================================*/
    public FastBitArray(bool[] values)
    {
        SetValues(values);
        _version = 0;
    }

    /*=========================================================================
    ** Allocates space to hold the bit values in values. values[0] represents
    ** bits 0 - 31, values[1] represents bits 32 - 63, etc. The LSB of each
    ** integer represents the lowest index value; values[0] & 1 represents bit
    ** 0, values[0] & 2 represents bit 1, values[0] & 4 represents bit 2, etc.
    ** Exceptions: ArgumentNullException if values == null.
    =========================================================================*/
    public FastBitArray(int[] values)
    {
        SetValues(values);
        _version = 0;
    }

    // Replaces the storage with a copy of values; the length becomes values.Length * 32 bits.
    private void SetValues(int[] values)
    {
        if (values == null)
        {
            throw new ArgumentNullException(nameof(values));
        }
        Contract.EndContractBlock();
        // this value is chosen to prevent overflow when computing m_length
        if (values.Length > Int32.MaxValue / BitsPerInt32)
        {
            throw new ArgumentException($"The array is too large: {BitsPerInt32}", nameof(values));
        }
        m_array = new int[values.Length];
        m_length = values.Length * BitsPerInt32;
        Array.Copy(values, m_array, values.Length);
    }

    /*=========================================================================
    ** Allocates a new BitArray with the same length and bit values as bits.
    **
    ** Exceptions: ArgumentNullException if bits == null.
    =========================================================================*/
    public FastBitArray(FastBitArray bits)
    {
        if (bits == null)
        {
            throw new ArgumentNullException(nameof(bits));
        }
        Contract.EndContractBlock();
        int arrayLength = GetArrayLength(bits.m_length, BitsPerInt32);
        m_array = new int[arrayLength];
        m_length = bits.m_length;
        Array.Copy(bits.m_array, m_array, arrayLength);
        _version = bits._version;
    }

    // Gets or sets the bit at the given position; see Get and Set.
    public bool this[int index]
    {
        get
        {
            return Get(index);
        }
        set
        {
            Set(index, value);
        }
    }

    // Replaces the storage with the packed form of values; the length becomes values.Length bits.
    private void SetValues(bool[] values)
    {
        if (values == null)
        {
            throw new ArgumentNullException(nameof(values));
        }
        Contract.EndContractBlock();
        m_array = new int[GetArrayLength(values.Length, BitsPerInt32)];
        m_length = values.Length;
        Parallel
            .ForEach(
                Partitioner.Create(0, m_array.Length),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        var idx = i * BitsPerInt32;
                        for (int j = 0; j < BitsPerInt32 && idx < values.Length; j++, idx++)
                        {
                            if (values[idx])
                            {
                                m_array[i] |= (1 << j);
                            }
                        }
                    }
                });
    }

    /*=========================================================================
    ** Returns the bit value at position index.
    ** Exceptions: ArgumentOutOfRangeException if index < 0 or
    ** index >= GetLength().
    =========================================================================*/
    public bool Get(int index)
    {
        if (index < 0 || index >= Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index), "Index is out of range");
        }
        Contract.EndContractBlock();
        return (m_array[index / BitsPerInt32] & (1 << (index % BitsPerInt32))) != 0;
    }

    /*=========================================================================
    ** Sets the bit value at position index to value.
    ** Exceptions: ArgumentOutOfRangeException if index < 0 or
    ** index >= GetLength().
    =========================================================================*/
    public void Set(int index, bool value)
    {
        if (index < 0 || index >= Length)
        {
            throw new ArgumentOutOfRangeException(nameof(index), "Index is out of range");
        }
        Contract.EndContractBlock();
        if (value)
        {
            m_array[index / BitsPerInt32] |= (1 << (index % BitsPerInt32));
        }
        else
        {
            m_array[index / BitsPerInt32] &= ~(1 << (index % BitsPerInt32));
        }
        _version++;
    }

    /*=========================================================================
    ** Sets all the bit values to value.
    =========================================================================*/
    public void SetAll(bool value)
    {
        int fillValue = value ? unchecked(((int)0xffffffff)) : 0;
        Parallel
            .ForEach(
                Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        m_array[i] = fillValue;
                    }
                });
        Interlocked.Increment(ref _version);
    }

    /*=========================================================================
    ** Returns a reference to the current instance ANDed with value.
    ** Exceptions: ArgumentNullException if value == null,
    ** ArgumentException if value.Length != this.Length.
    =========================================================================*/
    public FastBitArray And(FastBitArray value)
    {
        if (value == null)
            throw new ArgumentNullException(nameof(value));
        if (Length != value.Length)
            throw new ArgumentException("The array lengths differ.");
        Contract.EndContractBlock();
        Parallel
            .ForEach(
                Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        m_array[i] &= value.m_array[i];
                    }
                });
        Interlocked.Increment(ref _version);
        return this;
    }

    /*=========================================================================
    ** Returns a reference to the current instance ORed with value.
    ** Exceptions: ArgumentNullException if value == null,
    ** ArgumentException if value.Length != this.Length.
    =========================================================================*/
    public FastBitArray Or(FastBitArray value)
    {
        if (value == null)
            throw new ArgumentNullException(nameof(value));
        if (Length != value.Length)
            throw new ArgumentException("The array lengths differ");
        Contract.EndContractBlock();
        Parallel
            .ForEach(
                Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        m_array[i] |= value.m_array[i];
                    }
                });
        Interlocked.Increment(ref _version);
        return this;
    }

    /*=========================================================================
    ** Folds the array by the given factor: the array is viewed as `factor`
    ** consecutive segments of Length / factor bits and bit p of the result is
    ** set when bit p of any segment is set.
    ** With inPlace == false a new array is returned and this instance is left
    ** untouched; otherwise this instance is shrunk and returned. A factor of 1
    ** returns this instance unchanged in both modes.
    ** Exceptions: ArgumentException if factor == 0 or Length is not a
    ** multiple of factor.
    =========================================================================*/
    public FastBitArray Fold(uint factor, bool inPlace)
    {
        // factor is unsigned, so zero is the only non-positive value.
        if (factor == 0)
            throw new ArgumentException($"Fold factor should be a positive number (given value was {factor}).", nameof(factor));
        if (Length % factor != 0)
        {
            throw new ArgumentException(
                $"Fast bit array of size {Length} cannot be folded by a factor {factor}.", nameof(factor));
        }
        Contract.EndContractBlock();
        if (factor == 1) return this;
        int newLength = (int)(Length / factor);
        int arrayLength = GetArrayLength(newLength, BitsPerInt32);
        var newValues = new int[arrayLength];
        Parallel
            .ForEach(
                Partitioner.Create(0, arrayLength),
                (range, state) =>
                {
                    for (var i = range.Item1; i < range.Item2; i++)
                    {
                        var idx = i * BitsPerInt32;
                        for (var j = 0; j < BitsPerInt32 && idx < newLength; j++, idx++)
                        {
                            if (GetFolded(this, idx, factor, newLength))
                            {
                                newValues[i] |= (1 << j);
                            }
                        }
                    }
                });
        if (!inPlace)
        {
            var res = new FastBitArray(newValues);
            // The int[] constructor rounds the length up to whole ints; restore the exact bit count.
            res.m_length = newLength;
            return res;
        }
        SetValues(newValues);
        m_length = newLength;
        Interlocked.Increment(ref _version);
        return this;
    }

    // True when the bit at `position` is set in any of the foldFactor segments of foldedSize bits.
    private static bool GetFolded(FastBitArray bitArray, int position, uint foldFactor, int foldedSize)
    {
        if (foldFactor == 1) return bitArray[position];
        for (var i = 0; i < foldFactor; i++)
        {
            if (bitArray.Get(position + i * foldedSize))
                return true;
        }
        return false;
    }

    /*=========================================================================
    ** Returns a reference to the current instance XORed with value.
    ** Exceptions: ArgumentNullException if value == null,
    ** ArgumentException if value.Length != this.Length.
    =========================================================================*/
    public FastBitArray Xor(FastBitArray value)
    {
        if (value == null)
            throw new ArgumentNullException(nameof(value));
        if (Length != value.Length)
            throw new ArgumentException("The array lengths differ");
        Contract.EndContractBlock();
        Parallel.ForEach(
            Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)),
            (range, state) =>
            {
                for (var i = range.Item1; i < range.Item2; i++)
                {
                    m_array[i] ^= value.m_array[i];
                }
            });
        Interlocked.Increment(ref _version);
        return this;
    }

    /*=========================================================================
    ** Inverts all the bit values. On/true bit values are converted to
    ** off/false. Off/false bit values are turned on/true. The current instance
    ** is updated and returned.
    =========================================================================*/
    public FastBitArray Not()
    {
        Parallel.ForEach(
            Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)),
            (range, state) =>
            {
                for (var i = range.Item1; i < range.Item2; i++)
                {
                    m_array[i] = ~m_array[i];
                }
            });
        Interlocked.Increment(ref _version);
        return this;
    }

    // Number of bits. Growing appends false bits; the backing array is reallocated
    // when it is too small or when more than _ShrinkThreshold ints would be wasted.
    public int Length
    {
        get
        {
            Contract.Ensures(Contract.Result<int>() >= 0);
            return m_length;
        }
        set
        {
            if (value < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(value), "The length cannot be less than 0.");
            }
            Contract.EndContractBlock();
            int newints = GetArrayLength(value, BitsPerInt32);
            if (newints > m_array.Length || newints + _ShrinkThreshold < m_array.Length)
            {
                // grow or shrink (if wasting more than _ShrinkThreshold ints)
                int[] newarray = new int[newints];
                Array.Copy(m_array, newarray, newints > m_array.Length ? m_array.Length : newints);
                m_array = newarray;
            }
            if (value > m_length)
            {
                // clear high bit values in the last int
                int last = GetArrayLength(m_length, BitsPerInt32) - 1;
                int bits = m_length % 32;
                if (bits > 0)
                {
                    m_array[last] &= (1 << bits) - 1;
                }
                // clear remaining int values
                Array.Clear(m_array, last + 1, newints - last - 1);
            }
            m_length = value;
            _version++;
        }
    }

    // ICollection implementation
    // Copies the bits into an int[] (32 bits per element), a byte[] (8 bits per
    // element) or a bool[] (one bit per element), starting at index in the destination.
    // Exceptions: ArgumentNullException if array == null, ArgumentOutOfRangeException
    // if index < 0, ArgumentException for multi dimensional arrays, unsupported
    // element types or a destination that is too short.
    public void CopyTo(Array array, int index)
    {
        if (array == null)
            throw new ArgumentNullException(nameof(array));
        if (index < 0)
            throw new ArgumentOutOfRangeException(nameof(index), "The index cannot be less than 0.");
        if (array.Rank != 1)
            throw new ArgumentException("Multi dimensional arrays are not supported.");
        Contract.EndContractBlock();
        if (array is int[])
        {
            Array.Copy(m_array, 0, array, index, GetArrayLength(m_length, BitsPerInt32));
        }
        else if (array is byte[])
        {
            int arrayLength = GetArrayLength(m_length, BitsPerByte);
            if ((array.Length - index) < arrayLength)
                throw new ArgumentException("The offset is invalid.");
            byte[] b = (byte[])array;
            for (int i = 0; i < arrayLength; i++)
                b[index + i] = (byte)((m_array[i / 4] >> ((i % 4) * 8)) & 0x000000FF); // Shift to bring the required byte to LSB, then mask
        }
        else if (array is bool[])
        {
            if (array.Length - index < m_length)
                throw new ArgumentException("The offset is invalid.");
            bool[] b = (bool[])array;
            for (int i = 0; i < m_length; i++)
                b[index + i] = ((m_array[i / 32] >> (i % 32)) & 0x00000001) != 0;
        }
        else
        {
            // The destination type, not the offset, is the problem here.
            throw new ArgumentException("Only int[], byte[] and bool[] destination arrays are supported.", nameof(array));
        }
    }

    // Number of bits; same value as Length.
    public int Count
    {
        get
        {
            Contract.Ensures(Contract.Result<int>() >= 0);
            return m_length;
        }
    }

    // Returns an independent copy created through the copy constructor.
    public Object Clone()
    {
        Contract.Ensures(Contract.Result<Object>() != null);
        Contract.Ensures(((FastBitArray)Contract.Result<Object>()).Length == this.Length);
        return new FastBitArray(this);
    }

    // Lazily created lock object for callers that synchronize access themselves.
    public Object SyncRoot
    {
        get
        {
            if (_syncRoot == null)
            {
                System.Threading.Interlocked.CompareExchange<Object>(ref _syncRoot, new Object(), null);
            }
            return _syncRoot;
        }
    }

    public bool IsReadOnly
    {
        get
        {
            return false;
        }
    }

    public bool IsSynchronized
    {
        get
        {
            return false;
        }
    }

    // Enumerates the bits as boxed bool values; modifying the array invalidates the enumerator.
    public IEnumerator GetEnumerator()
    {
        return new BitArrayEnumeratorSimple(this);
    }

    // XPerY=n means that n Xs can be stored in 1 Y.
    private const int BitsPerInt32 = 32;
    private const int BytesPerInt32 = 4;
    private const int BitsPerByte = 8;

    /// <summary>
    /// Used for conversion between different representations of bit array.
    /// Returns (n+(div-1))/div, rearranged to avoid arithmetic overflow.
    /// For example, in the bit to int case, the straightforward calc would
    /// be (n+31)/32, but that would cause overflow. So instead it's
    /// rearranged to ((n-1)/32) + 1, with special casing for 0.
    ///
    /// Usage:
    /// GetArrayLength(77, BitsPerInt32): returns how many ints must be
    /// allocated to store 77 bits.
    /// </summary>
    /// <param name="n">Number of source units (bits or bytes) to store.</param>
    /// <param name="div">use a conversion constant, e.g. BytesPerInt32 to get
    /// how many ints are required to store n bytes</param>
    /// <returns>Number of target units needed; 0 when n is not positive.</returns>
    private static int GetArrayLength(int n, int div)
    {
        Contract.Assert(div > 0, "GetArrayLength: div arg must be greater than 0");
        return n > 0 ? (((n - 1) / div) + 1) : 0;
    }

    // Enumerator that walks the bits in index order and fails once the
    // array's version differs from the one captured at creation.
    [Serializable]
    private class BitArrayEnumeratorSimple : IEnumerator, ICloneable
    {
        private FastBitArray bitarray;
        private int index;
        private int version;
        private bool currentElement;

        internal BitArrayEnumeratorSimple(FastBitArray bitarray)
        {
            this.bitarray = bitarray;
            this.index = -1;
            version = bitarray._version;
        }

        public Object Clone()
        {
            return MemberwiseClone();
        }

        public virtual bool MoveNext()
        {
            if (version != bitarray._version) throw new InvalidOperationException("The version changed during enumeration");
            if (index < (bitarray.Count - 1))
            {
                index++;
                currentElement = bitarray.Get(index);
                return true;
            }
            else
                index = bitarray.Count;
            return false;
        }

        public virtual Object Current
        {
            get
            {
                if (index == -1)
                    throw new InvalidOperationException("Enumeration was not started");
                if (index >= bitarray.Count)
                    throw new InvalidOperationException("Enumeration was ended");
                return currentElement;
            }
        }

        public void Reset()
        {
            if (version != bitarray._version) throw new InvalidOperationException("The bit array was modified during enumeration.");
            index = -1;
        }
    }

    private int[] m_array;
    private int m_length;
    private int _version;
    [NonSerialized]
    private Object _syncRoot;
    private const int _ShrinkThreshold = 256;
}
}

@ -0,0 +1,23 @@
using System;
using ZeroLevel.Collections;
namespace ZeroLevel.Extensions
{
internal static class BitArrayExtensions
{
    /// <summary>
    /// Serializes a bit array into bytes: eight bits per byte, with the
    /// last byte used partially when the bit count is not a multiple of eight.
    /// </summary>
    /// <param name="bits">The bit array to convert.</param>
    /// <returns>The bit array converted to an array of bytes, or null when <paramref name="bits"/> is null.</returns>
    internal static byte[] ToBytes(this FastBitArray bits)
    {
        if (bits == null)
        {
            return null;
        }

        // Whole bytes plus one extra byte for any leftover bits.
        var byteCount = bits.Count / 8 + (bits.Count % 8 != 0 ? 1 : 0);
        var packed = new byte[byteCount];
        bits.CopyTo(packed, 0);
        return packed;
    }
}
}

@ -0,0 +1,11 @@
using System;
namespace ZeroLevel.Services.HashFunctions
{
/// <summary>
/// Contract for hash functions that produce an unsigned 32-bit value.
/// </summary>
public interface IHash
{
/// <summary>
/// Computes the hash of a string.
/// </summary>
/// <param name="s">The string to hash.</param>
/// <returns>The hash value.</returns>
uint Hash(string s);
/// <summary>
/// Computes the hash of a byte array.
/// </summary>
/// <param name="data">The bytes to hash.</param>
/// <returns>The hash value.</returns>
uint Hash(byte[] data);
/// <summary>
/// Computes the hash of a part of a byte array with an explicit seed.
/// NOTE(review): judging by the parameter names, offset is the index of the first
/// byte, len the number of bytes and seed the initial hash state - confirm against
/// the implementations.
/// </summary>
/// <param name="data">The buffer that contains the bytes to hash.</param>
/// <param name="offset">Presumably the index of the first byte to hash.</param>
/// <param name="len">Presumably the number of bytes to hash.</param>
/// <param name="seed">Presumably the seed of the hash function.</param>
/// <returns>The hash value.</returns>
uint Hash(byte[] data, int offset, uint len, uint seed);
}
}

@ -0,0 +1,99 @@
using System;
namespace ZeroLevel.Services.HashFunctions
{
/// <summary>
/// 32-bit Murmur3-style hash implemented with pointer arithmetic.
/// The overloads without a seed parameter use a fixed built-in seed.
/// </summary>
public class Murmur3Unsafe
    : IHash
{
    private const uint Seed = 0xc58f1a7b;
    private const UInt32 c1 = 0xcc9e2d51;
    private const UInt32 c2 = 0x1b873593;

    /// <summary>
    /// Hashes the UTF-16 code units of the string (two bytes per char) with the built-in seed.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public unsafe UInt32 Hash(string s)
    {
        if (s == null)
        {
            throw new ArgumentNullException(nameof(s));
        }
        fixed (char* input = s)
        {
            return Hash((byte*)input, (uint)s.Length * sizeof(char), Seed);
        }
    }

    /// <summary>
    /// Hashes the whole byte array with the built-in seed. An empty array is valid input.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public unsafe uint Hash(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }
        // Pinning the array itself (instead of &data[0]) also works for an empty
        // array: the pointer is then null and is never dereferenced because len is 0.
        fixed (byte* input = data)
        {
            return Hash(input, (uint)data.Length, Seed);
        }
    }

    /// <summary>
    /// Hashes <paramref name="len"/> bytes of <paramref name="data"/> starting at
    /// <paramref name="offset"/>, using the given seed.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The range described by <paramref name="offset"/> and <paramref name="len"/> does not fit into the array.
    /// </exception>
    public unsafe uint Hash(byte[] data, int offset, uint len, uint seed)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }
        if (offset < 0 || offset > data.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(offset));
        }
        // The raw pointer loop performs no bounds checks, so the requested
        // range must be validated here to avoid reading past the buffer.
        if (len > (uint)(data.Length - offset))
        {
            throw new ArgumentOutOfRangeException(nameof(len));
        }
        fixed (byte* input = data)
        {
            return Hash(input + offset, len, seed);
        }
    }

    // Core routine: 4-byte blocks, then the 1-3 byte tail, then the final mix.
    private static unsafe uint Hash(byte* data, uint len, uint seed)
    {
        UInt32 nblocks = len / 4;
        UInt32 h1 = seed;

        //----------
        // body
        UInt32 k1;
        UInt32* block = (UInt32*)data;
        for (UInt32 i = nblocks; i > 0; --i, ++block)
        {
            k1 = *block;
            k1 *= c1;
            k1 = Rotl32(k1, 15);
            k1 *= c2;
            h1 ^= k1;
            h1 = Rotl32(h1, 13);
            h1 = h1 * 5 + 0xe6546b64;
        }

        //----------
        // tail
        k1 = 0;
        uint rem = len & 3;
        byte* tail = (byte*)block;
        if (rem >= 3)
            k1 ^= (uint)(tail[2] << 16);
        if (rem >= 2)
            k1 ^= (uint)(tail[1] << 8);
        if (rem > 0)
        {
            k1 ^= tail[0];
            k1 *= c1;
            k1 = Rotl32(k1, 15);
            k1 *= c2;
            h1 ^= k1;
        }

        //----------
        // finalization
        h1 ^= len;
        h1 ^= h1 >> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >> 16;
        return h1;
    }

    // Rotates x left by r bits.
    private static UInt32 Rotl32(UInt32 x, int r)
    {
        return (x << r) | (x >> (32 - r));
    }
}
}

@ -0,0 +1,143 @@
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace ZeroLevel.Services.HashFunctions
{
/// <summary>
/// Span helpers used by the hash functions: reinterpret the front of a byte
/// span as typed values and advance the span past what was consumed.
/// </summary>
internal static class Utils
{
/// <summary>
/// Reinterprets as many whole <typeparamref name="TTo"/> values as fit at the
/// start of the byte span, returns them, and shrinks <paramref name="this"/>
/// to the bytes that were left over.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ReadOnlySpan<TTo> PopAll<TTo>(this ref ReadOnlySpan<byte> @this) where TTo : struct
{
#if NETCOREAPP3_0
var totBytes = @this.Length;
// Number of whole TTo elements, and the byte count they occupy.
var toLength = (totBytes / Unsafe.SizeOf<TTo>());
var sliceLength = toLength * Unsafe.SizeOf<TTo>();
ref var thisRef = ref MemoryMarshal.GetReference(@this);
// Remainder first, then the typed view over the consumed prefix.
@this = MemoryMarshal.CreateReadOnlySpan(ref Unsafe.Add(ref thisRef, sliceLength), totBytes - sliceLength);
return MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As<byte, TTo>(ref thisRef), toLength);
#else
return @this.PopAll<TTo, byte>();
#endif
}
/// <summary>
/// Generic form of PopAll: views the start of a <typeparamref name="TFrom"/> span
/// as <typeparamref name="TTo"/> values and advances the source span past them.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ReadOnlySpan<TTo> PopAll<TTo, TFrom>(this ref ReadOnlySpan<TFrom> @this) where TFrom : struct where TTo : struct
{
var totBytes = @this.Length * Unsafe.SizeOf<TFrom>();
var toLength = (totBytes / Unsafe.SizeOf<TTo>());
// Consumed length expressed in TFrom elements (integer division).
var sliceLength = toLength * Unsafe.SizeOf<TTo>() / Unsafe.SizeOf<TFrom>();
#if NETSTANDARD2_0
var result = MemoryMarshal.Cast<TFrom, TTo>(@this);
#else
var result = MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As<TFrom, TTo>(ref MemoryMarshal.GetReference(@this)), toLength);
#endif
@this = @this.Slice(sliceLength);
return result;
}
/// <summary>
/// Returns the value unchanged on a little-endian host and byte-swapped otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint AsLittleEndian(this uint @this)
{
if (BitConverter.IsLittleEndian) { return @this; }
return BinaryPrimitives.ReverseEndianness(@this);
}
/// <summary>
/// Returns the value unchanged on a little-endian host and byte-swapped otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong AsLittleEndian(this ulong @this)
{
if (BitConverter.IsLittleEndian) { return @this; }
return BinaryPrimitives.ReverseEndianness(@this);
}
/// <summary>
/// Tries to take exactly <paramref name="count"/> values of <typeparamref name="TTo"/>
/// from the front of the byte span. On success the span is advanced and true is
/// returned; otherwise the span is left untouched, <paramref name="popped"/> is
/// default and false is returned.
/// </summary>
public static bool TryPop<TTo>(this ref ReadOnlySpan<byte> @this, int count, out ReadOnlySpan<TTo> popped) where TTo : struct
{
var byteCount = count * Unsafe.SizeOf<TTo>();
if (@this.Length >= byteCount)
{
popped = MemoryMarshal.Cast<byte, TTo>(@this.Slice(0, byteCount));
@this = @this.Slice(byteCount);
return true;
}
popped = default;
return false;
}
/// <summary>
/// Returns a read-only reference to the <typeparamref name="TTo"/> overlaying the start of the span.
/// The indexer throws when the span is too short to hold one value.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ref readonly TTo First<TTo>(this ReadOnlySpan<byte> @this) where TTo : struct
{
return ref MemoryMarshal.Cast<byte, TTo>(@this)[0];
}
/// <summary>
/// Returns a read-only reference to the <typeparamref name="TTo"/> overlaying the
/// final sizeof(TTo) bytes of the span.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ref readonly TTo Last<TTo>(this ReadOnlySpan<byte> @this) where TTo : struct
{
return ref MemoryMarshal.Cast<byte, TTo>(@this.Slice(@this.Length - Unsafe.SizeOf<TTo>()))[0];
}
/// <summary>
/// Returns a read-only reference to the <typeparamref name="TTo"/> overlaying the
/// start of a span of <typeparamref name="TFrom"/> values.
/// </summary>
public static ref readonly TTo First<TFrom, TTo>(this ReadOnlySpan<TFrom> @this) where TTo : struct where TFrom : struct
{
#if NETSTANDARD2_0
return ref MemoryMarshal.Cast<TFrom, TTo>(@this)[0];
#else
//TODO: is this version actually any faster/better at all?
return ref MemoryMarshal.AsRef<TTo>(MemoryMarshal.AsBytes(@this));
#endif
}
}
/// <summary>
/// Reinterpreting casts between value types. The reference-returning casts
/// refuse to widen (source smaller than destination); the span-returning
/// ones go through <c>MemoryMarshal.Cast</c>.
/// </summary>
public static class Safeish
{
/// <summary>
/// Views <paramref name="from"/> as a read-only <typeparamref name="TTo"/>.
/// </summary>
/// <exception cref="InvalidCastException"><typeparamref name="TFrom"/> is smaller than <typeparamref name="TTo"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ref readonly TTo As<TFrom, TTo>(in TFrom from) where TTo : struct where TFrom : struct
{
// Refuse to read past the end of the source value.
if (Unsafe.SizeOf<TFrom>() < Unsafe.SizeOf<TTo>()) { throw new InvalidCastException(); }
return ref Unsafe.As<TFrom, TTo>(ref Unsafe.AsRef(from));
}
/// <summary>
/// Views <paramref name="from"/> as a writable <typeparamref name="TTo"/>.
/// </summary>
/// <exception cref="InvalidCastException"><typeparamref name="TFrom"/> is smaller than <typeparamref name="TTo"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ref TTo AsMut<TFrom, TTo>(ref TFrom from) where TTo : struct where TFrom : struct
{
if (Unsafe.SizeOf<TFrom>() < Unsafe.SizeOf<TTo>()) { throw new InvalidCastException(); }
return ref Unsafe.As<TFrom, TTo>(ref from);
}
/// <summary>
/// Exposes a single <typeparamref name="TFrom"/> value as a read-only span of <typeparamref name="TTo"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ReadOnlySpan<TTo> AsSpan<TFrom, TTo>(in TFrom from) where TTo : struct where TFrom : struct
{
// Build a one-element span over the value, then reinterpret its element type.
#if NETSTANDARD2_0
var asSpan = CreateReadOnlySpan(ref Unsafe.AsRef(from));
#else
var asSpan = MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(from), 1);
#endif
return MemoryMarshal.Cast<TFrom, TTo>(asSpan);
}
/// <summary>
/// Exposes a single <typeparamref name="TFrom"/> value as a writable span of <typeparamref name="TTo"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Span<TTo> AsMutableSpan<TFrom, TTo>(ref TFrom from) where TTo : struct where TFrom : struct
{
#if NETSTANDARD2_0
var asSpan = CreateSpan(ref Unsafe.AsRef(from));
#else
var asSpan = MemoryMarshal.CreateSpan(ref from, 1);
#endif
return MemoryMarshal.Cast<TFrom, TTo>(asSpan);
}
#if NETSTANDARD2_0
// Fallbacks for targets where the MemoryMarshal.Create*Span calls above are not used:
// wrap the address of the value in a one-element span.
// NOTE(review): the pointer is taken without pinning — presumably callers only pass
// stack or otherwise fixed locations; confirm.
private static unsafe Span<T> CreateSpan<T>(ref T from) where T : struct
{
void* ptr = Unsafe.AsPointer(ref from);
return new Span<T>(ptr, 1);
}
private static unsafe ReadOnlySpan<T> CreateReadOnlySpan<T>(ref T from) where T : struct
{
void* ptr = Unsafe.AsPointer(ref from);
return new ReadOnlySpan<T>(ptr, 1);
}
#endif
}
}

@ -0,0 +1,145 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
namespace ZeroLevel.Services.HashFunctions
{
/// <summary>
/// Seedless 64-bit hash over a byte span.
/// </summary>
/// <remarks>
/// NOTE(review): despite the class name, the five primes and the round structure
/// below look like the XXH64 algorithm rather than XXH3 — confirm before relying
/// on compatibility with other implementations.
/// Values are read from the buffer in host byte order (no endianness conversion is applied here).
/// </remarks>
public static class XXH3_64
{
private const ulong PRIME64_1 = 11400714785074694791UL;
private const ulong PRIME64_2 = 14029467366897019727UL;
private const ulong PRIME64_3 = 1609587929392839161UL;
private const ulong PRIME64_4 = 9650029242287828579UL;
private const ulong PRIME64_5 = 2870177450012600261UL;
// One 32-byte stripe: four lanes consumed per iteration of BulkStride.
[StructLayout(LayoutKind.Sequential)]
private struct QuadUlong
{
public ulong v1;
public ulong v2;
public ulong v3;
public ulong v4;
}
/// <summary>
/// Hashes the UTF-8 encoding of <paramref name="line"/>.
/// </summary>
public static ulong Hash(string line)
{
return Hash(new ReadOnlySpan<byte>(Encoding.UTF8.GetBytes(line)));
}
/// <summary>
/// Hashes a byte buffer: whole 32-byte stripes first, then 8-byte words,
/// then one optional 4-byte word, then up to three single bytes, followed
/// by a final bit-mixing step.
/// </summary>
/// <param name="buffer">The bytes to hash; may be empty.</param>
/// <returns>The 64-bit hash value.</returns>
public static ulong Hash(in ReadOnlySpan<byte> buffer)
{
unchecked
{
// Local copy: PopAll advances it as data is consumed.
var remainingBytes = buffer;
var bulkVals = remainingBytes.PopAll<QuadUlong>();
// Inputs shorter than one stripe start from PRIME64_5 instead of the merged accumulators.
var h64 = !bulkVals.IsEmpty ? BulkStride(bulkVals) : PRIME64_5;
h64 += (uint)buffer.Length;
var ulongSpan = remainingBytes.PopAll<ulong>();
for (int i = 0; i < ulongSpan.Length; i++)
{
var val = ulongSpan[i] * PRIME64_2;
val = RotateLeft(val, 31);
val *= PRIME64_1;
h64 ^= val;
h64 = RotateLeft(h64, 27) * PRIME64_1;
h64 += PRIME64_4;
}
// At most 7 bytes are left here; 'remaining' walks over them by reference.
ref byte remaining = ref MemoryMarshal.GetReference(remainingBytes);
if (remainingBytes.Length >= sizeof(uint))
{
h64 ^= Unsafe.As<byte, uint>(ref remaining) * PRIME64_1;
h64 = RotateLeft(h64, 23) * PRIME64_2;
h64 += PRIME64_3;
remaining = ref Unsafe.Add(ref remaining, sizeof(uint));
}
// Length modulo 4 is the number of trailing single bytes; each case falls through to the next.
switch (remainingBytes.Length % sizeof(uint))
{
case 3:
h64 = RotateLeft(h64 ^ remaining * PRIME64_5, 11) * PRIME64_1;
remaining = ref Unsafe.Add(ref remaining, 1);
goto case 2;
case 2:
h64 = RotateLeft(h64 ^ remaining * PRIME64_5, 11) * PRIME64_1;
remaining = ref Unsafe.Add(ref remaining, 1);
goto case 1;
case 1:
h64 = RotateLeft(h64 ^ remaining * PRIME64_5, 11) * PRIME64_1;
break;
}
// Final mixing of the accumulated state.
h64 ^= h64 >> 33;
h64 *= PRIME64_2;
h64 ^= h64 >> 29;
h64 *= PRIME64_3;
h64 ^= h64 >> 32;
return h64;
}
}
// Runs the four-lane accumulation over every 32-byte stripe and merges the lanes into one value.
[MethodImpl(MethodImplOptions.NoInlining)]
private static ulong BulkStride(in ReadOnlySpan<QuadUlong> bulkVals)
{
unchecked
{
// Initial lane values; the leading "0 +" / "0 -" terms stand where a seed of zero is applied.
ulong acc1 = 0 + PRIME64_1 + PRIME64_2;
ulong acc2 = 0 + PRIME64_2;
ulong acc3 = 0 + 0;
ulong acc4 = 0 - PRIME64_1;
for (int i = 0; i < bulkVals.Length; i++)
{
ref readonly QuadUlong val = ref bulkVals[i];
acc1 += val.v1 * PRIME64_2;
acc2 += val.v2 * PRIME64_2;
acc3 += val.v3 * PRIME64_2;
acc4 += val.v4 * PRIME64_2;
acc1 = RotateLeft(acc1, 31);
acc2 = RotateLeft(acc2, 31);
acc3 = RotateLeft(acc3, 31);
acc4 = RotateLeft(acc4, 31);
acc1 *= PRIME64_1;
acc2 *= PRIME64_1;
acc3 *= PRIME64_1;
acc4 *= PRIME64_1;
}
return MergeValues(acc1, acc2, acc3, acc4);
}
}
// Rotates val left by 'bits' (callers pass 0 < bits < 64).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong RotateLeft(ulong val, int bits) => (val << bits) | (val >> (64 - bits));
// Combines the four lane accumulators: a sum of differently rotated lanes,
// then each lane is folded in once more through MergeAccumulator.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong MergeValues(ulong v1, ulong v2, ulong v3, ulong v4)
{
var acc = RotateLeft(v1, 1) + RotateLeft(v2, 7) + RotateLeft(v3, 12) + RotateLeft(v4, 18);
acc = MergeAccumulator(acc, v1);
acc = MergeAccumulator(acc, v2);
acc = MergeAccumulator(acc, v3);
acc = MergeAccumulator(acc, v4);
return acc;
}
// Scrambles one lane (multiply, rotate, multiply) and folds it into the main accumulator.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong MergeAccumulator(ulong accMain, ulong accN)
{
accN = (accN * PRIME64_2);
accN = RotateLeft(accN, 31);
accN = accN * PRIME64_1;
accMain ^= accN;
accMain *= PRIME64_1;
return accMain + PRIME64_4;
}
}
}

@ -0,0 +1,124 @@
using System;
namespace ZeroLevel.Services.HashFunctions
{
/// <summary>
/// 32-bit xxHash-style hash implemented with raw pointers. An optional bias
/// is added to the initial state so that differently biased instances
/// produce different hash values for the same input.
/// Multi-byte words are read in the host's byte order.
/// </summary>
public class XXHashUnsafe
    : IHash
{
    // Seed used by the overloads that do not take one explicitly.
    private const uint Seed = 0xc58f1a7b;
    private const uint PRIME1 = 2654435761U;
    private const uint PRIME2 = 2246822519U;
    private const uint PRIME3 = 3266489917U;
    private const uint PRIME4 = 668265263U;
    private const uint PRIME5 = 0x165667b1;
    private readonly uint _bias;

    /// <summary>Creates a hasher whose initial state is offset by <paramref name="bias"/>.</summary>
    public XXHashUnsafe(uint bias = 0) => _bias = bias;

    /// <summary>Hashes the UTF-16 code units of a string with the default seed.</summary>
    /// <param name="s">The text to hash; an empty string is allowed.</param>
    /// <returns>The 32-bit hash value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public unsafe UInt32 Hash(string s)
    {
        if (s == null) throw new ArgumentNullException(nameof(s));
        fixed (char* input = s)
        {
            return Hash((byte*)input, (uint)s.Length * sizeof(char), Seed, _bias);
        }
    }

    /// <summary>Hashes a whole byte array with the default seed.</summary>
    /// <param name="data">The bytes to hash; an empty array is allowed.</param>
    /// <returns>The 32-bit hash value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public unsafe uint Hash(byte[] data)
    {
        if (data == null) throw new ArgumentNullException(nameof(data));
        // Pinning the array itself (not &data[0]) yields a null pointer for an
        // empty array instead of throwing; HashSmall never dereferences it when len == 0.
        fixed (byte* input = data)
        {
            return Hash(input, (uint)data.Length, Seed, _bias);
        }
    }

    /// <summary>Hashes a slice of a byte array with an explicit seed.</summary>
    /// <param name="data">The buffer containing the bytes to hash.</param>
    /// <param name="offset">Index of the first byte to hash.</param>
    /// <param name="len">Number of bytes to hash.</param>
    /// <param name="seed">Initial hash state (the instance bias is added on top).</param>
    /// <returns>The 32-bit hash value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The slice does not lie inside <paramref name="data"/>.</exception>
    public unsafe uint Hash(byte[] data, int offset, uint len, uint seed)
    {
        if (data == null) throw new ArgumentNullException(nameof(data));
        if (offset < 0 || offset > data.Length) throw new ArgumentOutOfRangeException(nameof(offset));
        // Without this check the pointer loops below would read past the array.
        if (len > (uint)(data.Length - offset)) throw new ArgumentOutOfRangeException(nameof(len));
        fixed (byte* input = data)
        {
            return Hash(input + offset, len, seed, _bias);
        }
    }

    // Main routine for inputs of 16 bytes or more; shorter inputs go to HashSmall.
    private unsafe static uint Hash(byte* data, uint len, uint seed, uint bias)
    {
        if (len < 16)
            return HashSmall(data, len, seed, bias);
        uint v1 = seed + PRIME1 + bias;
        uint v2 = v1 * PRIME2 + len;
        uint v3 = v2 * PRIME3;
        uint v4 = v3 * PRIME4;
        uint* p = (uint*)data;
        // Start of the final 16-byte window; it is always processed separately below.
        uint* limit = (uint*)(data + len - 16);
        while (p < limit)
        {
            v1 += Rotl32(v1, 13); v1 *= PRIME1; v1 += *p; p++;
            v2 += Rotl32(v2, 11); v2 *= PRIME1; v2 += *p; p++;
            v3 += Rotl32(v3, 17); v3 *= PRIME1; v3 += *p; p++;
            v4 += Rotl32(v4, 19); v4 *= PRIME1; v4 += *p; p++;
        }
        // The last window may overlap bytes already consumed by the loop.
        p = limit;
        v1 += Rotl32(v1, 17); v2 += Rotl32(v2, 19); v3 += Rotl32(v3, 13); v4 += Rotl32(v4, 11);
        v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1;
        v1 += *p; p++; v2 += *p; p++; v3 += *p; p++; v4 += *p;
        v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2;
        v1 += Rotl32(v1, 11); v2 += Rotl32(v2, 17); v3 += Rotl32(v3, 19); v4 += Rotl32(v4, 13);
        v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3;
        uint crc = v1 + Rotl32(v2, 3) + Rotl32(v3, 6) + Rotl32(v4, 9);
        crc ^= crc >> 11;
        crc += (PRIME4 + len) * PRIME1;
        crc ^= crc >> 15;
        crc *= PRIME2;
        crc ^= crc >> 13;
        return crc;
    }

    // Routine for inputs shorter than 16 bytes. 'data' may be null when len == 0.
    private unsafe static uint HashSmall(byte* data, uint len, uint seed, uint bias)
    {
        byte* p = data;
        byte* bEnd = data + len;
        uint idx = seed + PRIME1 + bias;
        uint crc = PRIME5;
        // Consume 4-byte words while MORE than four bytes remain (the last four
        // always go through the byte loop; changing that would change the hash
        // values). The condition is written as a remaining-byte count rather
        // than "p < bEnd - 4" so it cannot wrap around for a null/low base pointer.
        while (bEnd - p > 4)
        {
            crc += (*(uint*)p) + idx;
            idx++;
            crc += Rotl32(crc, 17) * PRIME4;
            crc *= PRIME1;
            p += 4;
        }
        while (p < bEnd)
        {
            crc += (*p) + idx;
            idx++;
            crc *= PRIME1;
            p++;
        }
        crc += len;
        crc ^= crc >> 15;
        crc *= PRIME2;
        crc ^= crc >> 13;
        crc *= PRIME3;
        crc ^= crc >> 16;
        return crc;
    }

    // Rotates x left by r bits (callers pass 0 < r < 32).
    private static UInt32 Rotl32(UInt32 x, int r)
    {
        return (x << r) | (x >> (32 - r));
    }
}
}

@ -81,3 +81,4 @@ namespace ZeroLevel.Services.Semantic.CValue
this.chunkerTag = chunkerTag; this.chunkerTag = chunkerTag;
} }
} }
}

@ -12,7 +12,7 @@ namespace ZeroLevel.Services.Semantic.Fasttext
public entry_type type; public entry_type type;
public List<int> subwords; public List<int> subwords;
} }
/*
internal class FTDictionary internal class FTDictionary
{ {
const int MAX_VOCAB_SIZE = 30000000; const int MAX_VOCAB_SIZE = 30000000;
@ -484,4 +484,5 @@ namespace ZeroLevel.Services.Semantic.Fasttext
return ntokens; return ntokens;
} }
} }
*/
} }

@ -4,6 +4,7 @@ using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Threading; using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using ZeroLevel.DataStructures;
using ZeroLevel.Implementation.Semantic.Helpers; using ZeroLevel.Implementation.Semantic.Helpers;
using ZeroLevel.Services.Serialization; using ZeroLevel.Services.Serialization;

@ -0,0 +1,155 @@
namespace ZeroLevel.Services.Semantic.Helpers
{
/// <summary>
/// Edit-distance measures between strings.
/// </summary>
public static class TextDistance
{
    // Smallest of three integers.
    private static int MinOf3(int a, int b, int c)
    {
        int smallest = a < b ? a : b;
        return c < smallest ? c : smallest;
    }

    /// <summary>
    /// Computes the Levenshtein distance between two strings.
    /// </summary>
    /// <param name="s1">The first <see cref="string"/>; null is treated like an empty string.</param>
    /// <param name="s2">The second <see cref="string"/>; null is treated like an empty string.</param>
    /// <returns>The edit distance between the given <see cref="string"/> objects.</returns>
    public static int LevenshteinDistance(string s1, string s2)
    {
        bool firstMissing = string.IsNullOrEmpty(s1);
        bool secondMissing = string.IsNullOrEmpty(s2);
        if (firstMissing)
        {
            return secondMissing ? 0 : s2.Length;
        }
        if (secondMissing)
        {
            return s1.Length;
        }

        // Only the previous and the current matrix row are kept.
        int width = s2.Length + 1;
        int[] above = new int[width];
        int[] current = new int[width];
        for (int col = 0; col < width; col++)
        {
            above[col] = col;
        }

        for (int row = 1; row <= s1.Length; row++)
        {
            char left = s1[row - 1];
            current[0] = row;
            for (int col = 1; col < width; col++)
            {
                int substitution = above[col - 1] + (left == s2[col - 1] ? 0 : 1);
                current[col] = MinOf3(
                    above[col] + 1,        // deletion
                    current[col - 1] + 1,  // insertion
                    substitution);
            }

            // The finished row becomes "above" for the next iteration.
            int[] swap = above;
            above = current;
            current = swap;
        }

        return above[width - 1];
    }

    /// <summary>
    /// Computes the Damerau-Levenshtein distance between two strings
    /// (insertions, deletions, substitutions and swaps of adjacent characters).
    /// </summary>
    /// <param name="s1">The first <see cref="string"/>; null is treated like an empty string.</param>
    /// <param name="s2">The second <see cref="string"/>; null is treated like an empty string.</param>
    /// <returns>The edit distance between the given <see cref="string"/> objects.</returns>
    public static int DamerauLevenshteinDistance(string s1, string s2)
    {
        bool firstMissing = string.IsNullOrEmpty(s1);
        bool secondMissing = string.IsNullOrEmpty(s2);
        if (firstMissing)
        {
            return secondMissing ? 0 : s2.Length;
        }
        if (secondMissing)
        {
            return s1.Length;
        }

        // Three rolling rows: the one being filled and the two before it
        // (the row two back is needed for the transposition case).
        int width = s2.Length + 1;
        int[] twoBack = new int[width];
        int[] oneBack = new int[width];
        int[] current = new int[width];
        for (int col = 0; col < width; col++)
        {
            oneBack[col] = col;
        }

        for (int row = 1; row <= s1.Length; row++)
        {
            current[0] = row;
            for (int col = 1; col < width; col++)
            {
                int cost = s1[row - 1] == s2[col - 1] ? 0 : 1;
                int best = MinOf3(
                    oneBack[col] + 1,          // deletion
                    current[col - 1] + 1,      // insertion
                    oneBack[col - 1] + cost);  // substitution

                bool adjacentSwap = row > 1 && col > 1
                    && s1[row - 1] == s2[col - 2]
                    && s1[row - 2] == s2[col - 1];
                if (adjacentSwap)
                {
                    int viaTransposition = twoBack[col - 2] + cost;
                    if (viaTransposition < best)
                    {
                        best = viaTransposition;
                    }
                }

                current[col] = best;
            }

            // Rotate the buffers; the oldest row is recycled as the next "current".
            int[] recycled = twoBack;
            twoBack = oneBack;
            oneBack = current;
            current = recycled;
        }

        return oneBack[width - 1];
    }
}
}

@ -0,0 +1,85 @@
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic.Model
{
/// <summary>
/// Packs a character from a fixed 64-symbol alphabet (Russian letters, Latin
/// letters and five punctuation marks) into one byte: the upper six bits hold
/// the alphabet index, the lower two bits hold flags.
/// </summary>
public class Symbol
{
    // Index -> character.
    internal static char[] _map_ind_ch = new char[64]
    {
        'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й',
        'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф',
        'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        '-', '.', ',', '!', '?'
    };

    // Character -> index (inverse of the table above).
    internal static Dictionary<char, byte> _map_ch_ind = new Dictionary<char, byte>(64)
    {
        { 'а', 0 }, { 'б', 1 }, { 'в', 2 }, { 'г', 3 }, { 'д', 4 }, { 'е', 5 },
        { 'ё', 6 }, { 'ж', 7 }, { 'з', 8 }, { 'и', 9 }, { 'й', 10 }, { 'к', 11 },
        { 'л', 12 }, { 'м', 13 }, { 'н', 14 }, { 'о', 15 }, { 'п', 16 }, { 'р', 17 },
        { 'с', 18 }, { 'т', 19 }, { 'у', 20 }, { 'ф', 21 }, { 'х', 22 }, { 'ц', 23 },
        { 'ч', 24 }, { 'ш', 25 }, { 'щ', 26 }, { 'ъ', 27 }, { 'ы', 28 }, { 'ь', 29 },
        { 'э', 30 }, { 'ю', 31 }, { 'я', 32 },
        { 'a', 33 }, { 'b', 34 }, { 'c', 35 }, { 'd', 36 }, { 'e', 37 }, { 'f', 38 },
        { 'g', 39 }, { 'h', 40 }, { 'i', 41 }, { 'j', 42 }, { 'k', 43 }, { 'l', 44 },
        { 'm', 45 }, { 'n', 46 }, { 'o', 47 }, { 'p', 48 }, { 'q', 49 }, { 'r', 50 },
        { 's', 51 }, { 't', 52 }, { 'u', 53 }, { 'v', 54 }, { 'w', 55 }, { 'x', 56 },
        { 'y', 57 }, { 'z', 58 },
        { '-', 59 }, { '.', 60 }, { ',', 61 }, { '!', 62 }, { '?', 63 }
    };

    const byte TERMINATE_FLAG = 1;
    const byte HAS_NEXT_FLAG = 2;

    /// <summary>
    /// Encodes a character and its flags into a single byte.
    /// </summary>
    /// <param name="ch">The character to encode.</param>
    /// <param name="is_leaf">When true, the terminate flag (bit 0) is set.</param>
    /// <param name="has_next">When true, the has-next flag (bit 1) is set.</param>
    /// <returns>
    /// The packed byte, or 0 when <paramref name="ch"/> is not in the alphabet.
    /// Note that 0 is also the encoding of the first alphabet symbol without flags.
    /// </returns>
    public static byte ToByte(char ch, bool is_leaf = false, bool has_next = false)
    {
        byte index;
        if (!_map_ch_ind.TryGetValue(ch, out index))
        {
            return 0;
        }

        int packed = index << 2;
        if (is_leaf)
        {
            packed |= TERMINATE_FLAG;
        }
        if (has_next)
        {
            packed |= HAS_NEXT_FLAG;
        }
        return (byte)packed;
    }

    /// <summary>Returns true when the terminate flag (bit 0) is set.</summary>
    public static bool IsTermiate(byte sym)
    {
        return (sym & TERMINATE_FLAG) != 0;
    }

    /// <summary>Returns true when the has-next flag (bit 1) is set.</summary>
    /// <remarks>
    /// NOTE(review): the name says "leaf" but the test is on the has-next flag,
    /// while <c>ToByte</c>'s <c>is_leaf</c> argument sets the terminate flag — confirm the intended meaning.
    /// </remarks>
    public static bool IsLeaf(byte sym)
    {
        return (sym & HAS_NEXT_FLAG) != 0;
    }

    /// <summary>
    /// Decodes the character stored in the upper six bits of <paramref name="sym"/>.
    /// </summary>
    public static char ToChar(byte sym)
    {
        int slot = sym >> 2;
        bool inRange = slot >= 0 && slot < 64;
        return inRange ? _map_ind_ch[slot] : '\0';
    }
}
/// <summary>
/// A tree node holding one packed symbol plus links to its next sibling and
/// first child. The insertion logic is not implemented yet.
/// </summary>
public class Node
{
    private byte Sym;
    private Node NextSibling;
    private Node NextChild;

    /// <summary>
    /// Placeholder for inserting <paramref name="line"/> starting at
    /// <paramref name="position"/>; currently does nothing.
    /// </summary>
    public void Append(string line, int position)
    {
    }
}

/// <summary>
/// Entry point of the symbol tree; forwards insertions to its root node.
/// </summary>
public class Tree
{
    // Created eagerly: the field was previously never assigned, so every
    // call to Append failed with a NullReferenceException.
    private readonly Node _root = new Node();

    /// <summary>Adds a word to the tree, starting at its first character.</summary>
    public void Append(string word)
    {
        _root.Append(word, 0);
    }
}
}

@ -0,0 +1,171 @@
using System;
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic.Helpers
{
/// <summary>
/// Substring search using the bad-character rule of the Boyer-Moore algorithm.
/// Comparisons are ordinal, character by character.
/// </summary>
public static class BoyerMoore
{
    /// <summary>
    /// Makes the bad char table. Contains the distance between the last character of the pattern
    /// and the rightmost occurrence of the character, not counting the last pattern position itself.
    /// </summary>
    private static Dictionary<char, int> BuildBadCharacterTable(string pattern)
    {
        var badCharTable = new Dictionary<char, int>();
        int lastIndex = pattern.Length - 1;
        for (int i = 0; i < lastIndex; i++)
        {
            badCharTable[pattern[i]] = lastIndex - i;
        }
        return badCharTable;
    }

    // How far to move the window after a mismatch at pattern index j (always at least 1).
    private static int ShiftAfterMismatch(Dictionary<char, int> badCharTable, char mismatched, int patternLength, int j)
    {
        int badChar;
        badCharTable.TryGetValue(mismatched, out badChar); // stays 0 when the character is absent
        int offset = badChar - patternLength + 1 + j;
        return offset > 1 ? offset : 1;
    }

    // How far to move the window after a full match, based on the text character
    // that directly follows the window.
    private static int ShiftAfterMatch(Dictionary<char, int> badCharTable, string pattern, char following)
    {
        // The table deliberately ignores the last pattern character, so that case
        // must be handled here: if the following character equals it, a match may
        // start just one position further (e.g. "aa" inside "aaa").
        if (pattern.Length > 0 && following == pattern[pattern.Length - 1])
        {
            return 1;
        }
        int badChar;
        badCharTable.TryGetValue(following, out badChar); // stays 0 when the character is absent
        return badChar + 1;
    }

    /// <summary>
    /// Searches for the first occurrence of a pattern in a target <see cref="string"/> using Boyer-Moore's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for. An empty pattern matches at position 0.</param>
    /// <returns>Returns the position of the first occurrence of the pattern. If not found returns -1.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static int BoyerMooreSearchFirst(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        var badCharTable = BuildBadCharacterTable(pattern);
        int patternLength = pattern.Length;
        int endOfSearch = target.Length - patternLength;

        int i = 0;
        while (i <= endOfSearch)
        {
            // Compare right to left.
            int j = patternLength - 1;
            while (j >= 0 && target[i + j] == pattern[j])
            {
                j--;
            }
            if (j < 0)
            {
                return i; // found a match
            }
            i += ShiftAfterMismatch(badCharTable, target[i + j], patternLength, j);
        }

        return -1;
    }

    /// <summary>
    /// Searches for all occurrences of a pattern in a target <see cref="string"/> using Boyer-Moore's algorithm.
    /// Overlapping occurrences are reported.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for. An empty pattern matches at every position from 0 to the target length.</param>
    /// <returns>Returns <see cref="IList{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs, in ascending order. <see cref="IList{T}"/> is empty if none found.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static IList<int> BoyerMooreSearchAll(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        var matches = new List<int>();
        var badCharTable = BuildBadCharacterTable(pattern);
        int patternLength = pattern.Length;
        int targetLength = target.Length;
        int endOfSearch = targetLength - patternLength;

        int i = 0;
        while (i <= endOfSearch)
        {
            int j = patternLength - 1;
            while (j >= 0 && target[i + j] == pattern[j])
            {
                j--;
            }

            if (j >= 0)
            {
                i += ShiftAfterMismatch(badCharTable, target[i + j], patternLength, j);
                continue;
            }

            matches.Add(i); // found a match
            if (i + patternLength < targetLength)
            {
                i += ShiftAfterMatch(badCharTable, pattern, target[i + patternLength]);
            }
            else
            {
                i++; // the window already touches the end of the target
            }
        }

        return matches;
    }

    /// <summary>
    /// Searches for the first occurrence of multiple patterns in a target <see cref="string"/> using Boyer-Moore's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Repeated patterns are allowed.</param>
    /// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="int"/> values of the position of first occurrence.
    /// If a pattern is not found there is no entry in the dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/>, <paramref name="patterns"/> or one of the patterns is null.</exception>
    public static Dictionary<string, int> BoyerMooreMultipleSearchFirst(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, int>();
        for (int i = 0; i < patterns.Count; i++)
        {
            int position = BoyerMooreSearchFirst(target, patterns[i]);
            if (position > -1)
            {
                // Indexer instead of Add: a repeated pattern must not throw.
                matches[patterns[i]] = position;
            }
        }
        return matches;
    }

    /// <summary>
    /// Searches for all occurrences of multiple patterns in a target <see cref="string"/> using Boyer-Moore's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Repeated patterns are allowed.</param>
    /// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="List{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs.
    /// If a pattern is not found there is no entry in the dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/>, <paramref name="patterns"/> or one of the patterns is null.</exception>
    public static Dictionary<string, List<int>> BoyerMooreMultipleSearchAll(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, List<int>>();
        for (int i = 0; i < patterns.Count; i++)
        {
            var positions = new List<int>(BoyerMooreSearchAll(target, patterns[i]));
            if (positions.Count > 0)
            {
                // Indexer instead of Add: a repeated pattern must not throw.
                matches[patterns[i]] = positions;
            }
        }
        return matches;
    }
}
}

@ -0,0 +1,215 @@
using System;
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic.Helpers
{
/// <summary>
/// Substring search using the Knuth-Morris-Pratt algorithm.
/// Comparisons are ordinal, character by character.
/// </summary>
public static class KnuthMorrisPratt
{
    // Builds the partial-match table: entry k is the pattern index to resume
    // from after a mismatch at pattern index k; -1 means "no fallback, move the
    // window by one".
    private static int[] BuildKMPTable(string pattern)
    {
        var kmpTable = new int[pattern.Length];
        if (kmpTable.Length < 2)
        {
            if (kmpTable.Length > 0)
                kmpTable[0] = -1;
            return kmpTable;
        }
        int tableIndex = 2; // current position in table for computation
        int patSubstrIndex = 0; // index in the pattern of the current substring
        // First two values are fixed -1 and 0 (the 0 is the array default)
        kmpTable[0] = -1;
        // Build table
        while (tableIndex < kmpTable.Length)
        {
            // If the substring continues
            if (pattern[tableIndex - 1] == pattern[patSubstrIndex])
            {
                kmpTable[tableIndex++] = ++patSubstrIndex;
            }
            // It does not but we can fall back
            else if (patSubstrIndex != 0)
            {
                patSubstrIndex = kmpTable[patSubstrIndex];
            }
            // If we ran out of candidates
            else
            {
                kmpTable[tableIndex++] = 0;
            }
        }
        return kmpTable;
    }

    // Moves the window after a failure at patternIndex: reuse the already matched
    // prefix when the table allows it, otherwise slide by one and restart.
    private static void FallBack(int[] kmpTable, ref int matchIndex, ref int patternIndex)
    {
        int resumeAt = kmpTable[patternIndex];
        if (resumeAt > -1)
        {
            matchIndex += patternIndex - resumeAt;
            patternIndex = resumeAt;
        }
        else
        {
            matchIndex++;
            patternIndex = 0;
        }
    }

    /// <summary>
    /// Searches for the first occurrence of a pattern in a target <see cref="string"/> using KnuthMorrisPratt's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for. An empty pattern matches at position 0.</param>
    /// <returns>Returns the position of the first occurrence of the pattern. If not found returns -1.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static int KnuthMorrisPrattSearchFirst(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));
        // The main loop indexes pattern[0], so the empty pattern is answered up
        // front (same result as the Boyer-Moore and Rabin-Karp searches in this namespace).
        if (pattern.Length == 0) return 0;

        var kmpTable = BuildKMPTable(pattern);
        int matchIndex = 0;   // start of the current candidate match in target
        int patternIndex = 0; // position in the pattern
        int targetLength = target.Length;
        int patternLength = pattern.Length;

        while (matchIndex + patternIndex < targetLength)
        {
            if (pattern[patternIndex] == target[matchIndex + patternIndex])
            {
                patternIndex++;
                if (patternIndex == patternLength)
                    return matchIndex;
            }
            else
            {
                FallBack(kmpTable, ref matchIndex, ref patternIndex);
            }
        }

        return -1;
    }

    /// <summary>
    /// Searches for all occurrences of a pattern in a target <see cref="string"/> using KnuthMorrisPratt's algorithm.
    /// Overlapping occurrences are reported.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for. An empty pattern matches at every position from 0 to the target length.</param>
    /// <returns>Returns <see cref="IList{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs, in ascending order. <see cref="IList{T}"/> is empty if none found.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static IList<int> KnuthMorrisPrattSearchAll(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        var matches = new List<int>();
        int targetLength = target.Length;
        int patternLength = pattern.Length;

        // The main loop indexes pattern[0]; treat the empty pattern as matching
        // at every position, which is what BoyerMooreSearchAll returns as well.
        if (patternLength == 0)
        {
            for (int position = 0; position <= targetLength; position++)
            {
                matches.Add(position);
            }
            return matches;
        }

        var kmpTable = BuildKMPTable(pattern);
        int matchIndex = 0;   // start of the current candidate match in target
        int patternIndex = 0; // position in the pattern

        while (matchIndex + patternIndex < targetLength)
        {
            if (pattern[patternIndex] == target[matchIndex + patternIndex])
            {
                patternIndex++;
                if (patternIndex == patternLength)
                {
                    matches.Add(matchIndex);
                    // Continue as if the last character had mismatched: this keeps
                    // the longest reusable prefix and still finds overlapping matches.
                    patternIndex--;
                    FallBack(kmpTable, ref matchIndex, ref patternIndex);
                }
            }
            else
            {
                FallBack(kmpTable, ref matchIndex, ref patternIndex);
            }
        }

        return matches;
    }

    /// <summary>
    /// Searches for the first occurrence of multiple patterns in a target <see cref="string"/> using KnuthMorrisPratt's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Repeated patterns are allowed.</param>
    /// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="int"/> values of the position of first occurrence.
    /// If a pattern is not found there is no entry in the dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/>, <paramref name="patterns"/> or one of the patterns is null.</exception>
    public static Dictionary<string, int> KnuthMorrisPrattMultipleSearchFirst(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, int>();
        for (int i = 0; i < patterns.Count; i++)
        {
            int position = KnuthMorrisPrattSearchFirst(target, patterns[i]);
            if (position > -1)
            {
                // Indexer instead of Add: a repeated pattern must not throw.
                matches[patterns[i]] = position;
            }
        }
        return matches;
    }

    /// <summary>
    /// Searches for all occurrences of multiple patterns in a target <see cref="string"/> using KnuthMorrisPratt's algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Repeated patterns are allowed.</param>
    /// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="List{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs.
    /// If a pattern is not found there is no entry in the dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/>, <paramref name="patterns"/> or one of the patterns is null.</exception>
    public static Dictionary<string, List<int>> KnuthMorrisPrattMultipleSearchAll(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, List<int>>();
        for (int i = 0; i < patterns.Count; i++)
        {
            var positions = new List<int>(KnuthMorrisPrattSearchAll(target, patterns[i]));
            if (positions.Count > 0)
            {
                // Indexer instead of Add: a repeated pattern must not throw.
                matches[patterns[i]] = positions;
            }
        }
        return matches;
    }
}
}

@ -0,0 +1,345 @@
using System;
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic.Helpers
{
public static class RabinKarp
{
/// <summary>
/// Finds the first occurrence of a pattern in a target <see cref="string"/> with Rabin-Karp's rolling-hash search.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>The position of the first occurrence of the pattern, or -1 when it does not occur.</returns>
public static int RabinKarpSearchFirst(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    const ulong radix = 256;     // multiplier applied per character
    const ulong modulus = 65537; // prime that keeps the hash small

    int windowSize = pattern.Length;
    int lastStart = target.Length - windowSize;
    if (lastStart < 0) return -1; // pattern is longer than the target

    // Hash the pattern and the first window; alongside, compute
    // radix^(windowSize - 1) mod modulus, the weight of the character
    // that leaves the window on each slide.
    ulong needleHash = 0;
    ulong windowHash = 0;
    ulong leadingWeight = 1;
    for (int k = 0; k < windowSize; k++)
    {
        needleHash = (needleHash * radix + pattern[k]) % modulus;
        windowHash = (windowHash * radix + target[k]) % modulus;
        if (k > 0)
        {
            leadingWeight = (leadingWeight * radix) % modulus;
        }
    }

    for (int start = 0; ; start++)
    {
        // Equal hashes can be a collision, so confirm with a real comparison.
        if (windowHash == needleHash && string.Equals(target.Substring(start, windowSize), pattern))
        {
            return start;
        }
        if (start == lastStart)
        {
            return -1; // every window has been examined
        }

        // Slide the window: drop target[start], append target[start + windowSize].
        ulong leaving = leadingWeight * target[start] % modulus;
        windowHash = (windowHash + modulus - leaving) % modulus;
        windowHash = (windowHash * radix + target[start + windowSize]) % modulus;
    }
}
/// <summary>
/// Searches for all occurrences of a pattern in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>Returns <see cref="IList{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs, in ascending order; overlapping occurrences are included.
/// The list is empty if none found. An empty pattern occurs at every position from 0 to <c>target.Length</c> inclusive.</returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="pattern"/> is null.</exception>
public static IList<int> RabinKarpSearchAll(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    const ulong alphabetSize = 256;   // radix of the polynomial hash
    const ulong moduloValue = 65537;  // prime modulus used for the hashing

    int patternLength = pattern.Length;
    // List with the positions where the pattern was found
    var matches = new List<int>();
    if (target.Length < patternLength) return matches;

    // The rolling update below relies on pow = alphabetSize^(patternLength - 1), which has no
    // meaning for an empty pattern: it used to report position 0 plus a spurious position after
    // every '\0' character of the target. Handle the empty pattern explicitly instead.
    if (patternLength == 0)
    {
        for (int position = 0; position <= target.Length; position++)
        {
            matches.Add(position);
        }
        return matches;
    }

    // Hash of the pattern and of the first window of the target
    ulong targetHash = 0;
    ulong patternHash = 0;
    for (int i = 0; i < patternLength; i++)
    {
        patternHash = (patternHash * alphabetSize + pattern[i]) % moduloValue;
        targetHash = (targetHash * alphabetSize + target[i]) % moduloValue;
    }

    // Equal hashes may be a collision, so confirm with an ordinal comparison
    // (done in place, without allocating a substring).
    if (patternHash == targetHash
        && string.CompareOrdinal(target, 0, pattern, 0, patternLength) == 0)
    {
        matches.Add(0);
    }

    // pow = alphabetSize^(patternLength - 1) mod moduloValue: weight of the window's first character
    ulong pow = 1;
    for (int i = 0; i < patternLength - 1; i++)
    {
        pow = (pow * alphabetSize) % moduloValue;
    }

    // Slide the window over the rest of the target
    int endOfSearch = target.Length - patternLength;
    for (int i = 0; i < endOfSearch; i++)
    {
        // Remove target[i] from the hash (moduloValue is added first so the unsigned
        // subtraction cannot wrap), then append target[i + patternLength].
        // pow < 65537 and a char is at most 65535, so the product fits easily in a ulong.
        targetHash = (targetHash + moduloValue - pow * target[i] % moduloValue) % moduloValue;
        targetHash = (targetHash * alphabetSize + target[i + patternLength]) % moduloValue;

        if (targetHash == patternHash
            && string.CompareOrdinal(target, i + 1, pattern, 0, patternLength) == 0)
        {
            matches.Add(i + 1);
        }
    }

    // Return the list with all starting positions of the pattern
    return matches;
}
/// <summary>
/// Searches for the first occurrence of multiple patterns in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. A pattern listed more than once is searched once.</param>
/// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="int"/> values of the position of first occurrence.
/// If a pattern is not found (which includes every pattern longer than the target) there is no entry in the dictionary.</returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
public static Dictionary<string, int> RabinKarpMultipleSearchFirst(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    const ulong alphabetSize = 256;   // radix of the polynomial hash
    const ulong moduloValue = 65537;  // prime modulus used for the hashing

    // Hash of every distinct pattern that can fit into the target
    var patternHashes = new Dictionary<string, ulong>();
    // Hash of the first target window for every distinct pattern length
    var targetHashes = new Dictionary<int, ulong>();
    // alphabetSize^(length - 1) mod moduloValue for every distinct pattern length
    var pows = new Dictionary<int, ulong>();
    // Patterns grouped by their length, so one rolling hash serves a whole group
    var patternLengths = new Dictionary<int, List<string>>();
    // First found position for every pattern
    var matches = new Dictionary<string, int>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern == null)
        {
            throw new ArgumentException("Patterns must not contain null entries.", nameof(patterns));
        }

        // A pattern longer than the target cannot occur, and hashing the first window for it
        // would index past the end of the target. A repeated pattern is already registered,
        // and adding it again would throw on the duplicate dictionary key.
        if (pattern.Length > target.Length || patternHashes.ContainsKey(pattern)) continue;

        int length = pattern.Length;
        List<string> sameLength;
        bool firstOfThisLength = !patternLengths.TryGetValue(length, out sameLength);
        if (firstOfThisLength)
        {
            sameLength = new List<string>();
            patternLengths.Add(length, sameLength);
        }
        sameLength.Add(pattern);

        ulong patternHash = 0;
        ulong targetHash = 0;
        ulong pow = 1;
        for (int j = 0; j < length; j++)
        {
            patternHash = (patternHash * alphabetSize + pattern[j]) % moduloValue;
            if (firstOfThisLength)
            {
                // The first-window hash and pow depend only on the length: compute them once per length.
                targetHash = (targetHash * alphabetSize + target[j]) % moduloValue;
                if (j != 0) // pow needs one multiplication less than the hash
                {
                    pow = (pow * alphabetSize) % moduloValue;
                }
            }
        }

        patternHashes.Add(pattern, patternHash);
        if (firstOfThisLength)
        {
            targetHashes.Add(length, targetHash);
            pows.Add(length, pow);
        }
    }

    // Scan the target once per distinct pattern length
    foreach (var patternsWithSpecificLength in patternLengths)
    {
        int patternLength = patternsWithSpecificLength.Key;
        List<string> candidates = patternsWithSpecificLength.Value;
        ulong targetHash = targetHashes[patternLength];
        ulong pow = pows[patternLength];
        int lastStart = target.Length - patternLength;
        int remaining = candidates.Count; // patterns of this length that are still unmatched

        for (int start = 0; start <= lastStart && remaining > 0; start++)
        {
            if (start > 0)
            {
                // Slide the window: drop target[start - 1], append the character entering on the right.
                // moduloValue is added first so the unsigned subtraction cannot wrap.
                targetHash = (targetHash + moduloValue - pow * target[start - 1] % moduloValue) % moduloValue;
                targetHash = (targetHash * alphabetSize + target[start - 1 + patternLength]) % moduloValue;
            }

            foreach (var pat in candidates)
            {
                if (matches.ContainsKey(pat)) continue; // only the first occurrence is wanted

                // Equal hashes may be a collision, so confirm with an ordinal comparison
                if (targetHash == patternHashes[pat]
                    && string.CompareOrdinal(target, start, pat, 0, patternLength) == 0)
                {
                    matches.Add(pat, start);
                    remaining--;
                }
            }
        }
    }

    return matches;
}
/// <summary>
/// Searches for all occurrences of multiple patterns in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. A pattern listed more than once is searched once.</param>
/// <returns>Returns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="List{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs, in ascending order.
/// If a pattern is not found (which includes every pattern longer than the target) there is no entry in the dictionary.
/// An empty pattern occurs at every position from 0 to <c>target.Length</c> inclusive.</returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
public static Dictionary<string, List<int>> RabinKarpMultipleSearchAll(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    const ulong alphabetSize = 256;   // radix of the polynomial hash
    const ulong moduloValue = 65537;  // prime modulus used for the hashing

    // Hash of every distinct, non-empty pattern that can fit into the target
    var patternHashes = new Dictionary<string, ulong>();
    // Hash of the first target window for every distinct pattern length
    var targetHashes = new Dictionary<int, ulong>();
    // alphabetSize^(length - 1) mod moduloValue for every distinct pattern length
    var pows = new Dictionary<int, ulong>();
    // Patterns grouped by their length, so one rolling hash serves a whole group
    var patternLengths = new Dictionary<int, List<string>>();
    // Found positions for every accepted pattern (entries that stay empty are removed at the end)
    var matches = new Dictionary<string, List<int>>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern == null)
        {
            throw new ArgumentException("Patterns must not contain null entries.", nameof(patterns));
        }

        // A pattern longer than the target cannot occur, and hashing the first window for it
        // would index past the end of the target. A repeated pattern is already registered,
        // and adding it again would throw on the duplicate dictionary key.
        if (pattern.Length > target.Length || matches.ContainsKey(pattern)) continue;

        var positions = new List<int>();
        matches.Add(pattern, positions);

        int length = pattern.Length;
        if (length == 0)
        {
            // The rolling hash is undefined for a zero-length window (it used to produce
            // spurious hits after '\0' characters), so the empty pattern is answered directly.
            for (int position = 0; position <= target.Length; position++)
            {
                positions.Add(position);
            }
            continue;
        }

        List<string> sameLength;
        bool firstOfThisLength = !patternLengths.TryGetValue(length, out sameLength);
        if (firstOfThisLength)
        {
            sameLength = new List<string>();
            patternLengths.Add(length, sameLength);
        }
        sameLength.Add(pattern);

        ulong patternHash = 0;
        ulong targetHash = 0;
        ulong pow = 1;
        for (int j = 0; j < length; j++)
        {
            patternHash = (patternHash * alphabetSize + pattern[j]) % moduloValue;
            if (firstOfThisLength)
            {
                // The first-window hash and pow depend only on the length: compute them once per length.
                targetHash = (targetHash * alphabetSize + target[j]) % moduloValue;
                if (j != 0) // pow needs one multiplication less than the hash
                {
                    pow = (pow * alphabetSize) % moduloValue;
                }
            }
        }

        patternHashes.Add(pattern, patternHash);
        if (firstOfThisLength)
        {
            targetHashes.Add(length, targetHash);
            pows.Add(length, pow);
        }
    }

    // Scan the target once per distinct pattern length
    foreach (var patternsWithSpecificLength in patternLengths)
    {
        int patternLength = patternsWithSpecificLength.Key;
        List<string> candidates = patternsWithSpecificLength.Value;
        ulong targetHash = targetHashes[patternLength];
        ulong pow = pows[patternLength];
        int lastStart = target.Length - patternLength;

        for (int start = 0; start <= lastStart; start++)
        {
            if (start > 0)
            {
                // Slide the window: drop target[start - 1], append the character entering on the right.
                // moduloValue is added first so the unsigned subtraction cannot wrap.
                targetHash = (targetHash + moduloValue - pow * target[start - 1] % moduloValue) % moduloValue;
                targetHash = (targetHash * alphabetSize + target[start - 1 + patternLength]) % moduloValue;
            }

            foreach (var pat in candidates)
            {
                // Equal hashes may be a collision, so confirm with an ordinal comparison
                if (targetHash == patternHashes[pat]
                    && string.CompareOrdinal(target, start, pat, 0, patternLength) == 0)
                {
                    matches[pat].Add(start);
                }
            }
        }
    }

    // Remove all patterns that are not found. Keys are collected first because a
    // dictionary must not be modified while it is being enumerated.
    var notFound = new List<string>();
    foreach (var entry in matches)
    {
        if (entry.Value.Count == 0)
        {
            notFound.Add(entry.Key);
        }
    }
    foreach (var key in notFound)
    {
        matches.Remove(key);
    }

    return matches;
}
}
}
Loading…
Cancel
Save

Powered by TurnKey Linux.