diff --git a/TestApp/Program.cs b/TestApp/Program.cs
index befde05..04923dd 100644
--- a/TestApp/Program.cs
+++ b/TestApp/Program.cs
@@ -1,5 +1,4 @@
 using Newtonsoft.Json;
-using System;
 using ZeroLevel;
 using ZeroLevel.Logging;
 
diff --git a/ZeroLevel.UnitTests/BloomFilterTest.cs b/ZeroLevel.UnitTests/BloomFilterTest.cs
new file mode 100644
index 0000000..4495a0c
--- /dev/null
+++ b/ZeroLevel.UnitTests/BloomFilterTest.cs
@@ -0,0 +1,111 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using Xunit;
+using ZeroLevel.DataStructures;
+
+namespace ZeroLevel.UnitTests
+{
+    /// <summary>
+    /// Tests that BloomFilter and HyperBloomBloom never report a false negative
+    /// for items that were added. False positives are only counted and printed.
+    /// </summary>
+    public class BloomFilterTest
+    {
+        private static Random random = new Random();
+
+        /// <summary>
+        /// Builds a random string of the given length from letters, digits and the space character.
+        /// </summary>
+        public static string RandomString(int length)
+        {
+            const string chars = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+            return new string(Enumerable.Repeat(chars, length)
+                .Select(s => s[random.Next(s.Length)]).ToArray());
+        }
+
+        [Fact]
+        public void SimpleBloomFilterTest()
+        {
+            // Arrange
+            var size = 100000;
+            // The element type is string: the sets are filled from RandomString and fed to Add(string).
+            var lines = new HashSet<string>(size);
+            var lines_another = new HashSet<string>(size);
+            for (int i = 0; i < size; i++)
+            {
+                lines.Add(RandomString(i % 9 + 5));
+                lines_another.Add(RandomString(i % 9 + 5));
+            }
+            var bloom = new BloomFilter(16536 * 1024, true);
+            // Act
+            var sw = new Stopwatch();
+            sw.Start();
+            foreach (var line in lines)
+            {
+                bloom.Add(line);
+            }
+            sw.Stop();
+            Debug.Print($"BloomFilter. Append {lines.Count} items. {sw.ElapsedMilliseconds} ms");
+
+            // Assert
+            foreach (var line in lines)
+            {
+                Assert.True(bloom.Contains(line));
+            }
+
+            // Count false positives: strings the filter claims to hold that were never added.
+            int collision_count = 0;
+
+            foreach (var line in lines_another)
+            {
+                if (bloom.Contains(line))
+                {
+                    if (false == lines.Contains(line))
+                    {
+                        collision_count++;
+                    }
+                }
+            }
+
+            Debug.WriteLine($"Collision for string: {collision_count}.");
+        }
+
+        [Fact]
+        public void HyperBloomBloomFilterTest()
+        {
+            // Arrange
+            var size = 100000;
+            var lines = new HashSet<string>(size);
+            var lines_another = new HashSet<string>(size);
+            for (int i = 0; i < size; i++)
+            {
+                lines.Add(RandomString(i % 9 + 5));
+                lines_another.Add(RandomString(i % 9 + 5));
+            }
+            var bloom = new HyperBloomBloom(16536 * 1024, true);
+            // Act
+            var sw = new Stopwatch();
+            sw.Start();
+            foreach (var line in lines)
+            {
+                bloom.Add(line);
+            }
+            sw.Stop();
+            Debug.Print($"BloomFilter. Append {lines.Count} items. {sw.ElapsedMilliseconds} ms");
+
+            // Assert
+            foreach (var line in lines)
+            {
+                Assert.True(bloom.Contains(line));
+            }
+
+            // Count false positives: strings the filter claims to hold that were never added.
+            int collision_count = 0;
+
+            foreach (var line in lines_another)
+            {
+                if (bloom.Contains(line))
+                {
+                    if (false == lines.Contains(line))
+                    {
+                        collision_count++;
+                    }
+                }
+            }
+
+            Debug.WriteLine($"Collision for string: {collision_count}.");
+        }
+    }
+}
diff --git a/ZeroLevel/DataStructures/BloomFilter.cs b/ZeroLevel/DataStructures/BloomFilter.cs
new file mode 100644
index 0000000..f33989e
--- /dev/null
+++ b/ZeroLevel/DataStructures/BloomFilter.cs
@@ -0,0 +1,176 @@
+using System;
+using System.Collections;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using ZeroLevel.Services.HashFunctions;
+
+namespace ZeroLevel.DataStructures
+{
+    /// <summary>
+    /// Bloom filter implementation, 128 bit
+    /// </summary>
+    public class BloomFilter
+    {
+        #region Private
+        // Hash values of one item: three for the string itself ("Direct") and
+        // three for the reversed string ("Reverse").
+        private struct HIND
+        {
+            public ulong PrimiryDirect;
+            public uint SecondDirect;
+            public uint ThirdDirect;
+            public ulong PrimiryReverse;
+            public uint SecondReverse;
+            public uint ThirdReverse;
+        }
+
+        private readonly
BitArray _primary;
+        private readonly BitArray _second;
+        private readonly BitArray _third;
+
+        // Bit arrays for the hashes of the reversed string; allocated only when _use_reverse is true.
+        private readonly BitArray _r_primary;
+        private readonly BitArray _r_second;
+        private readonly BitArray _r_third;
+
+        private readonly bool _use_reverse = false;
+        #endregion
+
+        /// <summary>
+        /// Creates a filter in which every bit array holds <paramref name="bit_size"/> bits.
+        /// </summary>
+        /// <param name="bit_size">Number of bits in each bit array.</param>
+        /// <param name="use_reverse">When true, the reversed string is hashed as well and
+        /// tracked in three additional bit arrays.</param>
+        public BloomFilter(int bit_size, bool use_reverse)
+        {
+            _use_reverse = use_reverse;
+
+            _primary = new BitArray(bit_size);
+            _second = new BitArray(bit_size);
+            _third = new BitArray(bit_size);
+
+            if (_use_reverse)
+            {
+                _r_primary = new BitArray(bit_size);
+                _r_second = new BitArray(bit_size);
+                _r_third = new BitArray(bit_size);
+            }
+        }
+
+        /// <summary>
+        /// Adds the item to the filter. Null and empty strings are ignored.
+        /// </summary>
+        public void Add(string item)
+        {
+            if (item == null || item.Length == 0) return;
+            var hind = Compute(item);
+            Add(hind);
+        }
+
+        /// <summary>
+        /// Returns false when the item was definitely not added and true when it may have been.
+        /// Null and empty strings are always reported as present.
+        /// </summary>
+        public bool Contains(string item)
+        {
+            if (item == null || item.Length == 0) return true;
+            var hind = Compute(item);
+            return Contains(hind);
+        }
+        /// <summary>
+        /// true if added, false if already exists
+        /// </summary>
+        public bool TryAdd(string item)
+        {
+            if (item == null || item.Length == 0) return false;
+            var hind = Compute(item);
+            if (Contains(hind))
+            {
+                return false;
+            }
+            Add(hind);
+            return true;
+        }
+
+        // Computes all hash values for the line; the reverse hashes stay zero when _use_reverse is false.
+        private HIND Compute(string line)
+        {
+            var hind = new HIND
+            {
+                PrimiryDirect = HashUL(line),
+                SecondDirect = HashXX(line),
+                ThirdDirect = HashMM(line)
+            };
+            if(_use_reverse)
+            {
+                var r = Reverse(line);
+                hind.PrimiryReverse = HashUL(r);
+                hind.SecondReverse = HashXX(r);
+                hind.ThirdReverse = HashMM(r);
+            }
+            return hind;
+        }
+
+        /// <summary>
+        /// Returns the string with its UTF-16 code units in reverse order.
+        /// </summary>
+        public static string Reverse(string s)
+        {
+            char[] charArray = s.ToCharArray();
+            Array.Reverse(charArray);
+            return new string(charArray);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void Add(HIND hind)
+        {
+            int pi = (int)(hind.PrimiryDirect % (ulong)_primary.Length);
+            _primary[pi] = true;
+
+            int si = (int)(hind.SecondDirect % (uint)_second.Length);
+            _second[si] = true;
+
+            int ti = (int)(hind.ThirdDirect % (uint)_third.Length);
+            _third[ti] = true;
+
+            if (_use_reverse)
+            {
+                // Each reverse array is indexed modulo its own length.
+                int rpi = (int)(hind.PrimiryReverse % (ulong)_r_primary.Length);
+                _r_primary[rpi] = true;
+
+                int rsi = (int)(hind.SecondReverse % (uint)_r_second.Length);
+                _r_second[rsi] = true;
+
+                int rti = (int)(hind.ThirdReverse % (uint)_r_third.Length);
+                _r_third[rti] = true;
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private bool Contains(HIND hind)
+        {
+            // Any unset bit proves the item was never added.
+            int pi = (int)(hind.PrimiryDirect % (ulong)_primary.Length);
+            if (!_primary[pi]) return false;
+
+            int si = (int)(hind.SecondDirect % (uint)_second.Length);
+            if (!_second[si]) return false;
+
+            int ti = (int)(hind.ThirdDirect % (uint)_third.Length);
+            if (!_third[ti]) return false;
+
+            if (_use_reverse)
+            {
+                int rpi = (int)(hind.PrimiryReverse % (ulong)_r_primary.Length);
+                if (!_r_primary[rpi]) return false;
+
+                int rsi = (int)(hind.SecondReverse % (uint)_r_second.Length);
+                if (!_r_second[rsi]) return false;
+
+                int rti = (int)(hind.ThirdReverse % (uint)_r_third.Length);
+                if (!_r_third[rti]) return false;
+            }
+            return true;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private ulong HashUL(string line)
+        {
+            return XXH3_64.Hash(line);
+        }
+
+        private readonly XXHashUnsafe _hash_xx_32 = new XXHashUnsafe();
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private uint HashXX(string line)
+        {
+            return _hash_xx_32.Hash(line);
+        }
+
+        private readonly Murmur3Unsafe _hash_mm_32 = new Murmur3Unsafe();
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private uint HashMM(string line)
+        {
+            return _hash_mm_32.Hash(line);
+        }
+    }
+}
diff --git a/ZeroLevel/DataStructures/HyperBloomBloom.cs b/ZeroLevel/DataStructures/HyperBloomBloom.cs
new file mode 100644
index 0000000..545e45e
--- /dev/null
+++ b/ZeroLevel/DataStructures/HyperBloomBloom.cs
@@ -0,0 +1,49 @@
+using System;
+using System.Collections.Generic;
+
+namespace ZeroLevel.DataStructures
+{
+    public class HyperBloomBloom
+    {
+        // Fallback filter for items whose first character has no dedicated shard.
+        private BloomFilter _trash;
+        // One filter per lower-cased first character.
+        private Dictionary<char, BloomFilter> _shardes = new Dictionary<char, BloomFilter>();
+
+
public HyperBloomBloom(int bit_size, bool use_reverse)
+        {
+            // One fallback filter plus one filter per supported first character,
+            // each created with the same bit_size and use_reverse settings.
+            _trash = new BloomFilter(bit_size, use_reverse);
+            foreach (var ch in "abcdefghijklmnopqrstuvwxyz0123456789-")
+            {
+                _shardes.Add(ch, new BloomFilter(bit_size, use_reverse));
+            }
+        }
+
+        /// <summary>
+        /// Adds the item to the shard chosen by its first character. Null and empty strings are ignored.
+        /// </summary>
+        public void Add(string item)
+        {
+            if (item == null || item.Length == 0) return;
+            SelectShard(item).Add(item);
+        }
+
+        /// <summary>
+        /// Returns false when the item was definitely not added and true when it may have been.
+        /// Null and empty strings are always reported as present.
+        /// </summary>
+        public bool Contains(string item)
+        {
+            if (item == null || item.Length == 0) return true;
+            return SelectShard(item).Contains(item);
+        }
+        /// <summary>
+        /// true if added, false if already exists
+        /// </summary>
+        public bool TryAdd(string item)
+        {
+            if (item == null || item.Length == 0) return false;
+            return SelectShard(item).TryAdd(item);
+        }
+
+        /// <summary>
+        /// Picks the filter responsible for a non-empty item: the shard registered for the
+        /// lower-cased first character, or the fallback filter when there is none.
+        /// </summary>
+        private BloomFilter SelectShard(string item)
+        {
+            BloomFilter filter;
+            if (_shardes.TryGetValue(Char.ToLowerInvariant(item[0]), out filter) == false) filter = _trash;
+            return filter;
+        }
+    }
+}
diff --git a/ZeroLevel/Services/Collections/SparceVector.cs b/ZeroLevel/DataStructures/SparceVector.cs
similarity index 98%
rename from ZeroLevel/Services/Collections/SparceVector.cs
rename to ZeroLevel/DataStructures/SparceVector.cs
index 104a4ee..fc11f69 100644
--- a/ZeroLevel/Services/Collections/SparceVector.cs
+++ b/ZeroLevel/DataStructures/SparceVector.cs
@@ -3,7 +3,7 @@ using System.Collections.Generic;
 using ZeroLevel.Models;
 using ZeroLevel.Services.Serialization;
 
-namespace ZeroLevel.Services.Semantic.Helpers
+namespace ZeroLevel.DataStructures
 {
     public sealed class SparceVector
         : IBinarySerializable
diff --git a/ZeroLevel/DataStructures/SparseMatrix.cs b/ZeroLevel/DataStructures/SparseMatrix.cs
new file mode 100644
index 0000000..4dd46fb
--- /dev/null
+++ b/ZeroLevel/DataStructures/SparseMatrix.cs
@@ -0,0 +1,257 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace
ZeroLevel.DataStructures
+{
+    /// <summary>
+    /// Represents a Sparse matrix.
+    /// </summary>
+    /// <typeparam name="T">The type of the stored value.</typeparam>
+    public class SparseMatrix<T>
+    {
+        /// <summary>
+        /// Dictionary containing the row index as a key and as a value another dictionary
+        /// containing the column index as a key and the stored value as a value.
+        /// </summary>
+        internal Dictionary<int, Dictionary<int, T>> rows;
+
+        /// <summary>
+        /// Dictionary containing the column index as a key and as a value another dictionary
+        /// containing the row index as a key and the stored value as a value.
+        /// </summary>
+        internal Dictionary<int, Dictionary<int, T>> cols;
+
+        /// <summary>
+        /// Gets the maximum reached height of the sparse matrix.
+        /// </summary>
+        public int Height { get; internal set; }
+
+        /// <summary>
+        /// Gets the maximum reached width of the sparse matrix.
+        /// </summary>
+        public int Width { get; internal set; }
+
+        /// <summary>
+        /// Gets the number of items in the <see cref="SparseMatrix{T}"/>.
+        /// </summary>
+        public int Count { get; internal set; }
+
+        /// <summary>
+        /// Gets or sets an item in the sparse matrix. If there is no item on the given position
+        /// on get the default value of T is returned and on set the item is added to the matrix.
+        /// Setting a position that already holds an item replaces it without changing <see cref="Count"/>.
+        /// </summary>
+        /// <param name="row">The zero-based row index of the item.</param>
+        /// <param name="col">The zero-based column index of the item.</param>
+        /// <returns>Returns the item in the sparse matrix. If there is no item on the given position the default value of T is returned instead.</returns>
+        public T this[int row, int col]
+        {
+            get
+            {
+                Dictionary<int, T> rowItems;
+                T item;
+                if (rows.TryGetValue(row, out rowItems) && rowItems.TryGetValue(col, out item))
+                {
+                    return item;
+                }
+
+                //If there is no item on the given position return default value
+                return default(T);
+            }
+            set
+            {
+                if (row >= Height) Height = row + 1;
+                if (col >= Width) Width = col + 1;
+
+                //If no items on the current row we have to create a new dictionary
+                Dictionary<int, T> rowItems;
+                if (!rows.TryGetValue(row, out rowItems))
+                {
+                    rowItems = new Dictionary<int, T>();
+                    rows.Add(row, rowItems);
+                }
+
+                //If no items on the current col we have to create a new dictionary
+                Dictionary<int, T> colItems;
+                if (!cols.TryGetValue(col, out colItems))
+                {
+                    colItems = new Dictionary<int, T>();
+                    cols.Add(col, colItems);
+                }
+
+                //Only a previously empty cell adds to Count; overwriting keeps it unchanged
+                if (!rowItems.ContainsKey(col))
+                {
+                    Count++;
+                }
+
+                rowItems[col] = value;
+                colItems[row] = value;
+            }
+        }
+
+        /// <summary>
+        /// Creates a new instance of the <see cref="SparseMatrix{T}"/> class.
+        /// </summary>
+        public SparseMatrix()
+        {
+            rows = new Dictionary<int, Dictionary<int, T>>();
+            cols = new Dictionary<int, Dictionary<int, T>>();
+        }
+
+        /// <summary>
+        /// Creates a new instance of the <see cref="SparseMatrix{T}"/> class from the given two dimensional array.
+        /// </summary>
+        /// <param name="array">The two dimensional array of items to add.</param>
+        /// <param name="zeroItem">The item considered a zero item. All items from the array equal to the zero item won't be added to the matrix.</param>
+        public SparseMatrix(T[,] array, T zeroItem)
+        {
+            rows = new Dictionary<int, Dictionary<int, T>>();
+            cols = new Dictionary<int, Dictionary<int, T>>();
+
+            for (int row = 0; row < array.GetLength(0); row++)
+            {
+                for (int col = 0; col < array.GetLength(1); col++)
+                {
+                    if (!object.Equals(array[row, col], zeroItem))
+                        this[row, col] = array[row, col];
+                }
+            }
+        }
+
+        /// <summary>
+        /// Determines whether the given position holds no item.
+        /// </summary>
+        /// <param name="row">The zero-based row index of the item.</param>
+        /// <param name="col">The zero-based column index of the item.</param>
+        /// <returns>Returns true if there is no item on the given position; otherwise false.</returns>
+        public bool IsCellEmpty(int row, int col)
+        {
+            Dictionary<int, T> rowItems;
+            return !(rows.TryGetValue(row, out rowItems) && rowItems.ContainsKey(col));
+        }
+
+        /// <summary>
+        /// Gets the items in the given row sorted by the column index as an <see cref="IEnumerable{T}"/>
+        /// of <see cref="KeyValuePair{TKey, TValue}"/> with the key being the column index and the value being the item.
+        /// </summary>
+        /// <param name="row">The zero-based row index.</param>
+        /// <returns>Returns an <see cref="IEnumerable{T}"/> of <see cref="KeyValuePair{TKey, TValue}"/>
+        /// with the key being the column index and the value being the item.</returns>
+        public IEnumerable<KeyValuePair<int, T>> GetRowItems(int row)
+        {
+            Dictionary<int, T> rowItems;
+            if (rows.TryGetValue(row, out rowItems))
+            {
+                var sortedDict = new SortedDictionary<int, T>(rowItems);
+                foreach (var item in sortedDict)
+                {
+                    yield return item;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Gets the items in the given column sorted by the row index as an <see cref="IEnumerable{T}"/>
+        /// of <see cref="KeyValuePair{TKey, TValue}"/> with the key being the row index and the value being the item.
+        /// </summary>
+        /// <param name="col">The zero-based column index.</param>
+        /// <returns>Returns an <see cref="IEnumerable{T}"/> of <see cref="KeyValuePair{TKey, TValue}"/>
+        /// with the key being the row index and the value being the item.</returns>
+        public IEnumerable<KeyValuePair<int, T>> GetColumnItems(int col)
+        {
+            Dictionary<int, T> colItems;
+            if (cols.TryGetValue(col, out colItems))
+            {
+                var sortedDict = new SortedDictionary<int, T>(colItems);
+                foreach (var item in sortedDict)
+                {
+                    yield return item;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Gets non empty rows indexes sorted in ascending order.
+        /// </summary>
+        /// <returns>Returns an <see cref="IEnumerable{T}"/> of integers being row indexes sorted in ascending order.</returns>
+        public IEnumerable<int> GetNonEmptyRows()
+        {
+            var sortedRows = new SortedSet<int>(rows.Keys);
+            foreach (var row in sortedRows)
+            {
+                yield return row;
+            }
+        }
+
+        /// <summary>
+        /// Gets non empty columns indexes sorted in ascending order.
+        /// </summary>
+        /// <returns>Returns an <see cref="IEnumerable{T}"/> of integers being column indexes sorted in ascending order.</returns>
+        public IEnumerable<int> GetNonEmptyColumns()
+        {
+            var sortedCols = new SortedSet<int>(cols.Keys);
+            foreach (var col in sortedCols)
+            {
+                yield return col;
+            }
+        }
+
+        /// <summary>
+        /// Removes the item on the given position.
+        /// </summary>
+        /// <param name="row">The zero-based row index.</param>
+        /// <param name="col">The zero-based column index.</param>
+        /// <returns>Returns true if item is removed successfully; otherwise false. Also returns false if the item is not found.</returns>
+        public bool Remove(int row, int col)
+        {
+            Dictionary<int, T> rowItems;
+            if (!rows.TryGetValue(row, out rowItems) || !rowItems.ContainsKey(col))
+            {
+                return false;
+            }
+
+            //The setter writes every item to both maps, so the column entry exists as well
+            Dictionary<int, T> colItems = cols[col];
+            bool removedSuccessfully = rowItems.Remove(col) && colItems.Remove(row);
+
+            //Drop inner dictionaries that became empty so the non-empty row/column queries stay accurate
+            if (rowItems.Count == 0)
+            {
+                rows.Remove(row);
+            }
+
+            if (colItems.Count == 0)
+            {
+                cols.Remove(col);
+            }
+
+            if (removedSuccessfully)
+                Count--;
+
+            return removedSuccessfully;
+        }
+
+        /// <summary>
+        /// Removes all elements from the sparse matrix.
+        /// </summary>
+        public void Clear()
+        {
+            rows.Clear();
+            cols.Clear();
+
+            Count = 0;
+            Height = 0;
+            Width = 0;
+        }
+
+        /// <summary>
+        /// Updates the height and the width of the matrix. If no items were removed from the matrix the dimensions will be correct.
+        /// </summary>
+        public void UpdateDimensions()
+        {
+            if (rows.Count == 0)
+            {
+                Height = 0;
+                Width = 0;
+                return;
+            }
+
+            Height = rows.Keys.Max() + 1;
+            Width = cols.Keys.Max() + 1;
+        }
+    }
+}
diff --git a/ZeroLevel/Services/Collections/FastBitArray.cs b/ZeroLevel/Services/Collections/FastBitArray.cs
new file mode 100644
index 0000000..30fdc95
--- /dev/null
+++ b/ZeroLevel/Services/Collections/FastBitArray.cs
@@ -0,0 +1,636 @@
+using System;
+using System.Collections;
+using System.Collections.Concurrent;
+using System.Diagnostics.Contracts;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace ZeroLevel.Collections
+{
+    // A vector of bits. Use this to store bits efficiently, without having to do bit
+    // shifting yourself.
+    [System.Runtime.InteropServices.ComVisible(true)]
+    [Serializable()]
+    public class FastBitArray : ICollection, ICloneable
+    {
+        private FastBitArray()
+        {
+        }
+
+        /*=========================================================================
+        ** Allocates space to hold length bit values. All of the values in the bit
+        ** array are set to false.
+
+        ** Exceptions: ArgumentException if length < 0.
+        =========================================================================*/
+        public FastBitArray(int length)
+            : this(length, false)
+
+        {
+        }
+
+        /*=========================================================================
+        ** Allocates space to hold length bit values. All of the values in the bit
+        ** array are set to defaultValue.
+        **
+        ** Exceptions: ArgumentOutOfRangeException if length < 0.
+        =========================================================================*/
+        public FastBitArray(int length, bool defaultValue)
+        {
+            if (length < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(length), "The length should be at least zero.");
+            }
+            Contract.EndContractBlock();
+            m_array = new int[GetArrayLength(length, BitsPerInt32)];
+            m_length = length;
+            int fillValue = defaultValue ? unchecked(((int)0xffffffff)) : 0;
+            for (int i = 0; i < m_array.Length; i++)
+            {
+                m_array[i] = fillValue;
+            }
+            _version = 0;
+        }
+
+        /*=========================================================================
+        ** Allocates space to hold the bit values in bytes. bytes[0] represents
+        ** bits 0 - 7, bytes[1] represents bits 8 - 15, etc. The LSB of each byte
+        ** represents the lowest index value; bytes[0] & 1 represents bit 0,
+        ** bytes[0] & 2 represents bit 1, bytes[0] & 4 represents bit 2, etc.
+        ** An empty array produces an empty bit array.
+
+        ** Exceptions: ArgumentException if bytes == null.
+        =========================================================================*/
+        public FastBitArray(byte[] bytes)
+        {
+            if (bytes == null)
+            {
+                throw new ArgumentNullException(nameof(bytes));
+            }
+            Contract.EndContractBlock();
+            // this value is chosen to prevent overflow when computing m_length.
+            // m_length is of type int32 and is exposed as a property, so
+            // type of m_length can't be changed to accommodate.
+            if (bytes.Length > Int32.MaxValue / BitsPerByte)
+            {
+                throw new ArgumentException($"The array is too large {BitsPerByte}", nameof(bytes));
+            }
+            m_array = new int[GetArrayLength(bytes.Length, BytesPerInt32)];
+            m_length = bytes.Length * BitsPerByte;
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range,
+            // so the parallel fill is skipped when there is nothing to pack.
+            if (m_array.Length > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, m_array.Length),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                var idx = i * BytesPerInt32;
+                                // Only complete groups of four bytes are packed here;
+                                // the trailing partial group is handled after the loop.
+                                if (idx + 3 < bytes.Length)
+                                {
+                                    m_array[i] = (bytes[idx] & 0xff) |
+                                        ((bytes[idx + 1] & 0xff) << 8) |
+                                        ((bytes[idx + 2] & 0xff) << 16) |
+                                        ((bytes[idx + 3] & 0xff) << 24);
+                                }
+                            }
+                        });
+            }
+
+            // Pack the 1-3 bytes left over after the last complete group into the last int.
+            var j = Math.Max(0, bytes.Length - (bytes.Length % BytesPerInt32));
+            var last = Math.Max(0, m_array.Length - 1);
+            switch (bytes.Length - j)
+            {
+                case 3:
+                    m_array[last] = ((bytes[j + 2] & 0xff) << 16);
+                    goto case 2;
+                // fall through
+                case 2:
+                    m_array[last] |= ((bytes[j + 1] & 0xff) << 8);
+                    goto case 1;
+                // fall through
+                case 1:
+                    m_array[last] |= (bytes[j] & 0xff);
+                    break;
+            }
+            _version = 0;
+        }
+
+        public FastBitArray(bool[] values)
+        {
+            SetValues(values);
+            _version = 0;
+        }
+
+        /*=========================================================================
+        ** Allocates space to hold the bit values in values. values[0] represents
+        ** bits 0 - 31, values[1] represents bits 32 - 63, etc. The LSB of each
+        ** integer represents the lowest index value; values[0] & 1 represents bit
+        ** 0, values[0] & 2 represents bit 1, values[0] & 4 represents bit 2, etc.
+
+        ** Exceptions: ArgumentException if values == null.
+        =========================================================================*/
+        public FastBitArray(int[] values)
+        {
+            SetValues(values);
+            _version = 0;
+        }
+
+        // Replaces the storage with a copy of the given ints; the length becomes 32 bits per int.
+        private void SetValues(int[] values)
+        {
+            if (values == null)
+            {
+                throw new ArgumentNullException(nameof(values));
+            }
+            Contract.EndContractBlock();
+            // this value is chosen to prevent overflow when computing m_length
+            if (values.Length > Int32.MaxValue / BitsPerInt32)
+            {
+                throw new ArgumentException($"The array is too large: {BitsPerInt32}", nameof(values));
+            }
+            m_array = new int[values.Length];
+            m_length = values.Length * BitsPerInt32;
+            Array.Copy(values, m_array, values.Length);
+        }
+
+        /*=========================================================================
+        ** Allocates a new BitArray with the same length and bit values as bits.
+        **
+        ** Exceptions: ArgumentException if bits == null.
+        =========================================================================*/
+        public FastBitArray(FastBitArray bits)
+        {
+            if (bits == null)
+            {
+                throw new ArgumentNullException(nameof(bits));
+            }
+            Contract.EndContractBlock();
+            int arrayLength = GetArrayLength(bits.m_length, BitsPerInt32);
+            m_array = new int[arrayLength];
+            m_length = bits.m_length;
+            Array.Copy(bits.m_array, m_array, arrayLength);
+            _version = bits._version;
+        }
+
+        public bool this[int index]
+        {
+            get
+            {
+                return Get(index);
+            }
+            set
+            {
+                Set(index, value);
+            }
+        }
+
+        // Replaces the storage with the given booleans, one bit per element.
+        private void SetValues(bool[] values)
+        {
+            if (values == null)
+            {
+                throw new ArgumentNullException(nameof(values));
+            }
+            Contract.EndContractBlock();
+            m_array = new int[GetArrayLength(values.Length, BitsPerInt32)];
+            m_length = values.Length;
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range,
+            // so an empty input skips the parallel packing.
+            if (m_array.Length > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, m_array.Length),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                var idx = i * BitsPerInt32;
+                                for (int j = 0; j < BitsPerInt32 && idx < values.Length; j++, idx++)
+                                {
+                                    if (values[idx])
+                                    {
+                                        m_array[i] |= (1 << j);
+                                    }
+                                }
+                            }
+                        });
+            }
+        }
+
+        /*=========================================================================
+        ** Returns the bit value at position index.
+
+        ** Exceptions: ArgumentOutOfRangeException if index < 0 or
+        **             index >= GetLength().
+        =========================================================================*/
+        public bool Get(int index)
+        {
+            if (index < 0 || index >= Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index), "Index is out of range");
+            }
+            Contract.EndContractBlock();
+            return (m_array[index / 32] & (1 << (index % 32))) != 0;
+        }
+
+        /*=========================================================================
+        ** Sets the bit value at position index to value.
+
+        ** Exceptions: ArgumentOutOfRangeException if index < 0 or
+        **             index >= GetLength().
+        =========================================================================*/
+        public void Set(int index, bool value)
+        {
+            if (index < 0 || index >= Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index), "Index is out of range");
+            }
+            Contract.EndContractBlock();
+            if (value)
+            {
+                m_array[index / 32] |= (1 << (index % 32));
+            }
+            else
+            {
+                m_array[index / 32] &= ~(1 << (index % 32));
+            }
+            _version++;
+        }
+
+        /*=========================================================================
+        ** Sets all the bit values to value.
+        =========================================================================*/
+        public void SetAll(bool value)
+        {
+            int fillValue = value ? unchecked(((int)0xffffffff)) : 0;
+            int ints = GetArrayLength(m_length, BitsPerInt32);
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range,
+            // so a zero-length bit array has nothing to fill.
+            if (ints > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, ints),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                m_array[i] = fillValue;
+                            }
+                        });
+            }
+            Interlocked.Increment(ref _version);
+        }
+
+        /*=========================================================================
+        ** Returns a reference to the current instance ANDed with value.
+
+        ** Exceptions: ArgumentException if value == null or
+        **             value.Length != this.Length.
+        =========================================================================*/
+        public FastBitArray And(FastBitArray value)
+        {
+            if (value == null)
+                throw new ArgumentNullException(nameof(value));
+            if (Length != value.Length)
+                throw new ArgumentException("The array lengths differ.");
+            Contract.EndContractBlock();
+            int ints = GetArrayLength(m_length, BitsPerInt32);
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range.
+            if (ints > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, ints),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                m_array[i] &= value.m_array[i];
+                            }
+                        });
+            }
+            Interlocked.Increment(ref _version);
+            return this;
+        }
+
+        /*=========================================================================
+        ** Returns a reference to the current instance ORed with value.
+
+        ** Exceptions: ArgumentException if value == null or
+        **             value.Length != this.Length.
+        =========================================================================*/
+        public FastBitArray Or(FastBitArray value)
+        {
+            if (value == null)
+                throw new ArgumentNullException(nameof(value));
+            if (Length != value.Length)
+                throw new ArgumentException("The array lengths differ");
+            Contract.EndContractBlock();
+            int ints = GetArrayLength(m_length, BitsPerInt32);
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range.
+            if (ints > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, ints),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                m_array[i] |= value.m_array[i];
+                            }
+                        });
+            }
+            Interlocked.Increment(ref _version);
+            return this;
+        }
+
+        /*=========================================================================
+        ** Folds the array by factor: bit p of the result is the OR of the bits at
+        ** positions p, p + newLength, p + 2 * newLength, ... of this instance,
+        ** where newLength = Length / factor. With inPlace the current instance is
+        ** replaced and returned, otherwise a new instance is returned. A factor
+        ** of 1 returns the current instance unchanged.
+
+        ** Exceptions: ArgumentException if factor == 0 or Length is not a
+        **             multiple of factor.
+        =========================================================================*/
+        public FastBitArray Fold(uint factor, bool inPlace)
+        {
+            if (factor <= 0)
+                throw new ArgumentException($"Fold factor should be a positive number (given value was {factor}).", nameof(factor));
+            if (Length % factor != 0)
+            {
+                throw new ArgumentException(
+                    $"Fast bit array of size {Length} cannot be folded by a factor {factor}.", nameof(factor));
+            }
+            Contract.EndContractBlock();
+            if (factor == 1) return this;
+            int newLength = (int)(Length / factor);
+            int arrayLength = GetArrayLength(newLength, BitsPerInt32);
+            var newValues = new int[arrayLength];
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range,
+            // which happens when a zero-length array is folded.
+            if (arrayLength > 0)
+            {
+                Parallel
+                    .ForEach(
+                        Partitioner.Create(0, arrayLength),
+                        (range, state) =>
+                        {
+                            for (var i = range.Item1; i < range.Item2; i++)
+                            {
+                                var idx = i * BitsPerInt32;
+                                for (var j = 0; j < BitsPerInt32 && idx < newLength; j++, idx++)
+                                {
+                                    if (GetFolded(this, idx, factor, newLength))
+                                    {
+                                        newValues[i] |= (1 << j);
+                                    }
+
+                                }
+                            }
+                        });
+            }
+            if (!inPlace)
+            {
+                var res = new FastBitArray(newValues);
+                // The int[] constructor rounds the length up to whole ints; restore the exact bit count.
+                res.m_length = newLength;
+                return res;
+            }
+            SetValues(newValues);
+            m_length = newLength;
+            Interlocked.Increment(ref _version);
+            return this;
+        }
+
+        // True when any of the foldFactor source bits that map onto the folded position is set.
+        private static bool GetFolded(FastBitArray bitArray, int position, uint foldFactor, int foldedSize)
+        {
+            if (foldFactor == 1) return bitArray[position];
+            for (var i = 0; i < foldFactor; i++)
+            {
+                if (bitArray.Get(position + i * foldedSize))
+                    return true;
+            }
+            return false;
+        }
+
+        /*=========================================================================
+        ** Returns a reference to the current instance XORed with value.
+
+        ** Exceptions: ArgumentException if value == null or
+        **             value.Length != this.Length.
+        =========================================================================*/
+        public FastBitArray Xor(FastBitArray value)
+        {
+            if (value == null)
+                throw new ArgumentNullException(nameof(value));
+            if (Length != value.Length)
+                throw new ArgumentException("The array lengths differ");
+            Contract.EndContractBlock();
+            int ints = GetArrayLength(m_length, BitsPerInt32);
+            // Partitioner.Create throws ArgumentOutOfRangeException for an empty range.
+            if (ints > 0)
+            {
+                Parallel.ForEach(
+                    Partitioner.Create(0, ints),
+                    (range, state) =>
+                    {
+                        for (var i = range.Item1; i < range.Item2; i++)
+                        {
+                            m_array[i] ^= value.m_array[i];
+                        }
+                    });
+            }
+            Interlocked.Increment(ref _version);
+            return this;
+        }
+
+        /*=========================================================================
+        ** Inverts all the bit values. On/true bit values are converted to
+        ** off/false. Off/false bit values are turned on/true. The current instance
+        ** is updated and returned.
+ =========================================================================*/ + public FastBitArray Not() + { + Parallel.ForEach( + Partitioner.Create(0, GetArrayLength(m_length, BitsPerInt32)), + (range, state) => + { + for (var i = range.Item1; i < range.Item2; i++) + { + m_array[i] = ~m_array[i]; + } + }); + Interlocked.Increment(ref _version); + return this; + } + + public int Length + { + get + { + Contract.Ensures(Contract.Result() >= 0); + return m_length; + } + set + { + if (value < 0) + { + throw new ArgumentOutOfRangeException(nameof(value), "The length cannot be less than 0."); + } + Contract.EndContractBlock(); + int newints = GetArrayLength(value, BitsPerInt32); + if (newints > m_array.Length || newints + _ShrinkThreshold < m_array.Length) + { + // grow or shrink (if wasting more than _ShrinkThreshold ints) + int[] newarray = new int[newints]; + Array.Copy(m_array, newarray, newints > m_array.Length ? m_array.Length : newints); + m_array = newarray; + } + if (value > m_length) + { + // clear high bit values in the last int + int last = GetArrayLength(m_length, BitsPerInt32) - 1; + int bits = m_length % 32; + if (bits > 0) + { + m_array[last] &= (1 << bits) - 1; + } + // clear remaining int values + Array.Clear(m_array, last + 1, newints - last - 1); + } + m_length = value; + _version++; + } + } + + // ICollection implementation + public void CopyTo(Array array, int index) + { + if (array == null) + throw new ArgumentNullException(nameof(array)); + if (index < 0) + throw new ArgumentOutOfRangeException(nameof(index), "The index cannot be less than 0."); + if (array.Rank != 1) + throw new ArgumentException("Multi dimensional arrays are not supported."); + Contract.EndContractBlock(); + if (array is int[]) + { + Array.Copy(m_array, 0, array, index, GetArrayLength(m_length, BitsPerInt32)); + } + else if (array is byte[]) + { + int arrayLength = GetArrayLength(m_length, BitsPerByte); + if ((array.Length - index) < arrayLength) + throw new 
ArgumentException("The offset is invalid."); + byte[] b = (byte[])array; + for (int i = 0; i < arrayLength; i++) + b[index + i] = (byte)((m_array[i / 4] >> ((i % 4) * 8)) & 0x000000FF); // Shift to bring the required byte to LSB, then mask + } + else if (array is bool[]) + { + if (array.Length - index < m_length) + throw new ArgumentException("The offset is invalid."); + bool[] b = (bool[])array; + for (int i = 0; i < m_length; i++) + b[index + i] = ((m_array[i / 32] >> (i % 32)) & 0x00000001) != 0; + } + else + throw new ArgumentException("The offset is invalid."); + } + + public int Count + { + get + { + Contract.Ensures(Contract.Result() >= 0); + return m_length; + } + } + + public Object Clone() + { + Contract.Ensures(Contract.Result() != null); + Contract.Ensures(((FastBitArray)Contract.Result()).Length == this.Length); + return new FastBitArray(this); + } + + public Object SyncRoot + { + get + { + if (_syncRoot == null) + { + System.Threading.Interlocked.CompareExchange(ref _syncRoot, new Object(), null); + } + return _syncRoot; + } + } + + public bool IsReadOnly + { + get + { + return false; + } + } + + public bool IsSynchronized + { + get + { + return false; + } + } + + public IEnumerator GetEnumerator() + { + return new BitArrayEnumeratorSimple(this); + } + + // XPerY=n means that n Xs can be stored in 1 Y. + private const int BitsPerInt32 = 32; + private const int BytesPerInt32 = 4; + private const int BitsPerByte = 8; + + /// + /// Used for conversion between different representations of bit array. + /// Returns (n+(div-1))/div, rearranged to avoid arithmetic overflow. + /// For example, in the bit to int case, the straightforward calc would + /// be (n+31)/32, but that would cause overflow. So instead it's + /// rearranged to ((n-1)/32) + 1, with special casing for 0. + /// + /// Usage: + /// GetArrayLength(77, BitsPerInt32): returns how many ints must be + /// allocated to store 77 bits. + /// + /// + /// use a conversion constant, e.g. 
BytesPerInt32 to get + /// how many ints are required to store n bytes + /// + private static int GetArrayLength(int n, int div) + { + Contract.Assert(div > 0, "GetArrayLength: div arg must be greater than 0"); + return n > 0 ? (((n - 1) / div) + 1) : 0; + } + + [Serializable] + private class BitArrayEnumeratorSimple : IEnumerator, ICloneable + { + private FastBitArray bitarray; + private int index; + private int version; + private bool currentElement; + + internal BitArrayEnumeratorSimple(FastBitArray bitarray) + { + this.bitarray = bitarray; + this.index = -1; + version = bitarray._version; + } + + public Object Clone() + { + return MemberwiseClone(); + } + + public virtual bool MoveNext() + { + if (version != bitarray._version) throw new InvalidOperationException("The version changed during enumeration"); + if (index < (bitarray.Count - 1)) + { + index++; + currentElement = bitarray.Get(index); + return true; + } + else + index = bitarray.Count; + return false; + } + + public virtual Object Current + { + get + { + if (index == -1) + throw new InvalidOperationException("Enumeration was not started"); + if (index >= bitarray.Count) + throw new InvalidOperationException("Enumeration was ended"); + return currentElement; + } + } + + public void Reset() + { + if (version != bitarray._version) throw new InvalidOperationException("The bit array was modified during enumeration."); + index = -1; + } + } + + private int[] m_array; + private int m_length; + private int _version; + + [NonSerialized] + private Object _syncRoot; + private const int _ShrinkThreshold = 256; + } +} diff --git a/ZeroLevel/Services/Extensions/BitArrayExtensions.cs b/ZeroLevel/Services/Extensions/BitArrayExtensions.cs new file mode 100644 index 0000000..55f0c62 --- /dev/null +++ b/ZeroLevel/Services/Extensions/BitArrayExtensions.cs @@ -0,0 +1,23 @@ +using System; +using ZeroLevel.Collections; + +namespace ZeroLevel.Extensions +{ + internal static class BitArrayExtensions + { + // + // serialize a 
/// <summary>
/// 32-bit Murmur3-style hash implemented over raw pointers.
/// Strings are hashed over their in-memory UTF-16 code units.
/// </summary>
public class Murmur3Unsafe
    : IHash
{
    private const uint Seed = 0xc58f1a7b;

    private const UInt32 c1 = 0xcc9e2d51;
    private const UInt32 c2 = 0x1b873593;

    /// <summary>Hashes the UTF-16 code units of <paramref name="s"/> with the default seed.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public unsafe UInt32 Hash(string s)
    {
        if (s == null) throw new ArgumentNullException(nameof(s));

        fixed (char* input = s)
        {
            return Hash((byte*)input, (uint)s.Length * sizeof(char), Seed);
        }
    }

    /// <summary>Hashes the whole array with the default seed. An empty array is valid input.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public unsafe uint Hash(byte[] data)
    {
        if (data == null) throw new ArgumentNullException(nameof(data));

        // Pinning the array itself (instead of &data[0]) does not throw for an empty array:
        // the pointer is then null, and the core routine never dereferences it when len == 0.
        fixed (byte* input = data)
        {
            return Hash(input, (uint)data.Length, Seed);
        }
    }

    /// <summary>
    /// Hashes <paramref name="len"/> bytes of <paramref name="data"/> starting at
    /// <paramref name="offset"/> using the supplied <paramref name="seed"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The requested range does not lie inside <paramref name="data"/>.
    /// </exception>
    public unsafe uint Hash(byte[] data, int offset, uint len, uint seed)
    {
        if (data == null) throw new ArgumentNullException(nameof(data));
        if (offset < 0 || offset > data.Length) throw new ArgumentOutOfRangeException(nameof(offset));
        // Without this check the pointer loop below would read past the end of the array.
        if (len > (uint)(data.Length - offset)) throw new ArgumentOutOfRangeException(nameof(len));

        fixed (byte* input = data)
        {
            return Hash(input + offset, len, seed);
        }
    }

    /// <summary>Core routine: mixes whole 4-byte blocks, then the 1-3 byte tail, then finalizes.</summary>
    private unsafe static uint Hash(byte* data, uint len, uint seed)
    {
        UInt32 nblocks = len / 4;
        UInt32 h1 = seed;

        //----------
        // body

        UInt32 k1;
        UInt32* block = (UInt32*)data;
        for (UInt32 i = nblocks; i > 0; --i, ++block)
        {
            k1 = *block;

            k1 *= c1;
            k1 = Rotl32(k1, 15);
            k1 *= c2;

            h1 ^= k1;
            h1 = Rotl32(h1, 13);
            h1 = h1 * 5 + 0xe6546b64;
        }

        //----------
        // tail: the bytes left over after the last whole block

        k1 = 0;
        uint rem = len & 3;
        byte* tail = (byte*)block;
        if (rem >= 3)
            k1 ^= (uint)(tail[2] << 16);
        if (rem >= 2)
            k1 ^= (uint)(tail[1] << 8);
        if (rem > 0)
        {
            k1 ^= tail[0];
            k1 *= c1;
            k1 = Rotl32(k1, 15);
            k1 *= c2;
            h1 ^= k1;
        }

        //----------
        // finalization

        h1 ^= len;

        h1 ^= h1 >> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >> 16;

        return h1;
    }

    /// <summary>Rotates <paramref name="x"/> left by <paramref name="r"/> bits.</summary>
    private static UInt32 Rotl32(UInt32 x, int r)
    {
        return (x << r) | (x >> (32 - r));
    }
}
/// <summary>
/// Returns <paramref name="this"/> unchanged when <see cref="BitConverter.IsLittleEndian"/> is true,
/// otherwise the value with its bytes reversed.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint AsLittleEndian(this uint @this)
    => BitConverter.IsLittleEndian ? @this : BinaryPrimitives.ReverseEndianness(@this);

/// <summary>
/// Returns <paramref name="this"/> unchanged when <see cref="BitConverter.IsLittleEndian"/> is true,
/// otherwise the value with its bytes reversed.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong AsLittleEndian(this ulong @this)
    => BitConverter.IsLittleEndian ? @this : BinaryPrimitives.ReverseEndianness(@this);
+ return ref MemoryMarshal.AsRef(MemoryMarshal.AsBytes(@this)); +#endif + } + } + + public static class Safeish + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ref readonly TTo As(in TFrom from) where TTo : struct where TFrom : struct + { + if (Unsafe.SizeOf() < Unsafe.SizeOf()) { throw new InvalidCastException(); } + return ref Unsafe.As(ref Unsafe.AsRef(from)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ref TTo AsMut(ref TFrom from) where TTo : struct where TFrom : struct + { + if (Unsafe.SizeOf() < Unsafe.SizeOf()) { throw new InvalidCastException(); } + return ref Unsafe.As(ref from); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ReadOnlySpan AsSpan(in TFrom from) where TTo : struct where TFrom : struct + { +#if NETSTANDARD2_0 + var asSpan = CreateReadOnlySpan(ref Unsafe.AsRef(from)); +#else + var asSpan = MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(from), 1); +#endif + return MemoryMarshal.Cast(asSpan); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Span AsMutableSpan(ref TFrom from) where TTo : struct where TFrom : struct + { +#if NETSTANDARD2_0 + var asSpan = CreateSpan(ref Unsafe.AsRef(from)); +#else + var asSpan = MemoryMarshal.CreateSpan(ref from, 1); +#endif + return MemoryMarshal.Cast(asSpan); + } + +#if NETSTANDARD2_0 + private static unsafe Span CreateSpan(ref T from) where T : struct + { + void* ptr = Unsafe.AsPointer(ref from); + return new Span(ptr, 1); + } + + private static unsafe ReadOnlySpan CreateReadOnlySpan(ref T from) where T : struct + { + void* ptr = Unsafe.AsPointer(ref from); + return new ReadOnlySpan(ptr, 1); + } +#endif + } +} diff --git a/ZeroLevel/Services/HashFunctions/XXH3_64.cs b/ZeroLevel/Services/HashFunctions/XXH3_64.cs new file mode 100644 index 0000000..ef11770 --- /dev/null +++ b/ZeroLevel/Services/HashFunctions/XXH3_64.cs @@ -0,0 +1,145 @@ +using System; +using System.Runtime.CompilerServices; +using 
/// <summary>
/// Hashes the UTF-8 encoding of <paramref name="line"/> by delegating to the span overload.
/// </summary>
/// <param name="line">Text to hash.</param>
/// <returns>The 64-bit hash of the encoded bytes.</returns>
public static ulong Hash(string line)
{
    byte[] utf8 = Encoding.UTF8.GetBytes(line);
    var buffer = new ReadOnlySpan<byte>(utf8);
    return Hash(buffer);
}
>> 32; + + return h64; + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ulong BulkStride(in ReadOnlySpan bulkVals) + { + unchecked + { + ulong acc1 = 0 + PRIME64_1 + PRIME64_2; + ulong acc2 = 0 + PRIME64_2; + ulong acc3 = 0 + 0; + ulong acc4 = 0 - PRIME64_1; + + for (int i = 0; i < bulkVals.Length; i++) + { + ref readonly QuadUlong val = ref bulkVals[i]; + + acc1 += val.v1 * PRIME64_2; + acc2 += val.v2 * PRIME64_2; + acc3 += val.v3 * PRIME64_2; + acc4 += val.v4 * PRIME64_2; + + acc1 = RotateLeft(acc1, 31); + acc2 = RotateLeft(acc2, 31); + acc3 = RotateLeft(acc3, 31); + acc4 = RotateLeft(acc4, 31); + + acc1 *= PRIME64_1; + acc2 *= PRIME64_1; + acc3 *= PRIME64_1; + acc4 *= PRIME64_1; + } + + return MergeValues(acc1, acc2, acc3, acc4); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong RotateLeft(ulong val, int bits) => (val << bits) | (val >> (64 - bits)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong MergeValues(ulong v1, ulong v2, ulong v3, ulong v4) + { + var acc = RotateLeft(v1, 1) + RotateLeft(v2, 7) + RotateLeft(v3, 12) + RotateLeft(v4, 18); + acc = MergeAccumulator(acc, v1); + acc = MergeAccumulator(acc, v2); + acc = MergeAccumulator(acc, v3); + acc = MergeAccumulator(acc, v4); + return acc; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong MergeAccumulator(ulong accMain, ulong accN) + { + accN = (accN * PRIME64_2); + accN = RotateLeft(accN, 31); + accN = accN * PRIME64_1; + accMain ^= accN; + accMain *= PRIME64_1; + return accMain + PRIME64_4; + } + } +} diff --git a/ZeroLevel/Services/HashFunctions/XXHashUnsafe.cs b/ZeroLevel/Services/HashFunctions/XXHashUnsafe.cs new file mode 100644 index 0000000..62772c2 --- /dev/null +++ b/ZeroLevel/Services/HashFunctions/XXHashUnsafe.cs @@ -0,0 +1,124 @@ +using System; + +namespace ZeroLevel.Services.HashFunctions +{ + public class XXHashUnsafe + : IHash + { + private const uint Seed = 0xc58f1a7b; + 
/// <summary>Hashes the UTF-16 code units of <paramref name="s"/> with the default seed.</summary>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public unsafe UInt32 Hash(string s)
{
    if (s == null) throw new ArgumentNullException(nameof(s));

    fixed (char* input = s)
    {
        return Hash((byte*)input, (uint)s.Length * sizeof(char), Seed, _bias);
    }
}

/// <summary>Hashes the whole array with the default seed. An empty array is valid input.</summary>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public unsafe uint Hash(byte[] data)
{
    if (data == null) throw new ArgumentNullException(nameof(data));
    // &data[0] throws for an empty array, so the empty case is hashed separately.
    if (data.Length == 0) return HashEmpty(Seed, _bias);

    fixed (byte* input = &data[0])
    {
        return Hash(input, (uint)data.Length, Seed, _bias);
    }
}

/// <summary>
/// Hashes <paramref name="len"/> bytes of <paramref name="data"/> starting at
/// <paramref name="offset"/> using the supplied <paramref name="seed"/>.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The requested range does not lie inside <paramref name="data"/>.
/// </exception>
public unsafe uint Hash(byte[] data, int offset, uint len, uint seed)
{
    if (data == null) throw new ArgumentNullException(nameof(data));
    if (offset < 0 || offset > data.Length) throw new ArgumentOutOfRangeException(nameof(offset));
    // Without this check the pointer loops would read past the end of the array.
    if (len > (uint)(data.Length - offset)) throw new ArgumentOutOfRangeException(nameof(len));
    // Covers an empty array and offset == data.Length, where &data[offset] would throw.
    if (len == 0) return HashEmpty(seed, _bias);

    fixed (byte* input = &data[offset])
    {
        return Hash(input, len, seed, _bias);
    }
}

/// <summary>
/// Hashes zero bytes. A real (non-null) address is passed because the small-input routine
/// computes "end - 4" from the base pointer before comparing, which would wrap around for null.
/// </summary>
private unsafe static uint HashEmpty(uint seed, uint bias)
{
    byte placeholder = 0;
    return Hash(&placeholder, 0, seed, bias);
}
/// <summary>
/// Rotates <paramref name="x"/> left by <paramref name="r"/> bits: the bits shifted out at the
/// top are OR-ed back in at the bottom.
/// </summary>
private static UInt32 Rotl32(UInt32 x, int r)
{
    UInt32 shiftedUp = x << r;
    UInt32 wrappedAround = x >> (32 - r);
    return shiftedUp | wrappedAround;
}
/// <summary>
/// Edit-distance metrics between strings, computed with rolling rows instead of a full matrix.
/// </summary>
public static class TextDistance
{
    /// <summary>Returns the smallest of the three values.</summary>
    private static int MinOf3(int a, int b, int c)
    {
        int smallest = a;
        if (b < smallest) smallest = b;
        if (c < smallest) smallest = c;
        return smallest;
    }

    /// <summary>
    /// Computes the Levenshtein distance (insertions, deletions, substitutions) between two strings.
    /// </summary>
    /// <param name="s1">The first string; null is treated like an empty string.</param>
    /// <param name="s2">The second string; null is treated like an empty string.</param>
    /// <returns>The edit distance between the given strings.</returns>
    public static int LevenshteinDistance(string s1, string s2)
    {
        bool firstEmpty = string.IsNullOrEmpty(s1);
        bool secondEmpty = string.IsNullOrEmpty(s2);
        if (firstEmpty) return secondEmpty ? 0 : s2.Length;
        if (secondEmpty) return s1.Length;

        int width = s2.Length + 1;
        int[] previous = new int[width];
        int[] current = new int[width];

        // Distance from the empty prefix of s1 to each prefix of s2.
        for (int col = 0; col < width; col++)
        {
            previous[col] = col;
        }

        for (int row = 1; row <= s1.Length; row++)
        {
            char fromFirst = s1[row - 1];
            current[0] = row;

            for (int col = 1; col < width; col++)
            {
                int cost = fromFirst == s2[col - 1] ? 0 : 1;
                current[col] = MinOf3(
                    previous[col] + 1,          // deletion
                    current[col - 1] + 1,       // insertion
                    previous[col - 1] + cost);  // substitution
            }

            // The row just filled becomes the reference row for the next iteration.
            int[] swap = previous;
            previous = current;
            current = swap;
        }

        return previous[s2.Length];
    }

    /// <summary>
    /// Computes the Damerau-Levenshtein distance between two strings: Levenshtein edits plus
    /// transposition of two adjacent characters.
    /// </summary>
    /// <param name="s1">The first string; null is treated like an empty string.</param>
    /// <param name="s2">The second string; null is treated like an empty string.</param>
    /// <returns>The edit distance between the given strings.</returns>
    public static int DamerauLevenshteinDistance(string s1, string s2)
    {
        bool firstEmpty = string.IsNullOrEmpty(s1);
        bool secondEmpty = string.IsNullOrEmpty(s2);
        if (firstEmpty) return secondEmpty ? 0 : s2.Length;
        if (secondEmpty) return s1.Length;

        int width = s2.Length + 1;
        int[] twoBack = new int[width]; // row i-2, only read once two rows have been filled
        int[] oneBack = new int[width]; // row i-1
        int[] active = new int[width];  // row i, being filled

        for (int col = 0; col < width; col++)
        {
            oneBack[col] = col;
        }

        for (int row = 1; row <= s1.Length; row++)
        {
            active[0] = row;

            for (int col = 1; col < width; col++)
            {
                int cost = s1[row - 1] == s2[col - 1] ? 0 : 1;
                int best = MinOf3(
                    oneBack[col] + 1,          // deletion
                    active[col - 1] + 1,       // insertion
                    oneBack[col - 1] + cost);  // substitution

                bool adjacentSwap = row > 1 && col > 1
                    && s1[row - 1] == s2[col - 2]
                    && s1[row - 2] == s2[col - 1];
                if (adjacentSwap)
                {
                    int viaTransposition = twoBack[col - 2] + cost;
                    if (viaTransposition < best) best = viaTransposition;
                }

                active[col] = best;
            }

            // Rotate the three buffers: i-1 becomes i-2, i becomes i-1, the oldest is reused.
            int[] recycled = twoBack;
            twoBack = oneBack;
            oneBack = active;
            active = recycled;
        }

        return oneBack[s2.Length];
    }
}
/// <summary>
/// Owns the root <see cref="Node"/> and forwards appended words to it.
/// </summary>
public class Tree
{
    // Created eagerly so that Append never dereferences a null root.
    private readonly Node _root = new Node();

    /// <summary>Appends <paramref name="word"/> to the tree, starting at its first character.</summary>
    /// <param name="word">The word handed to the root node.</param>
    public void Append(string word)
    {
        _root.Append(word, 0);
    }
}
/// <summary>
/// Substring search based on the Boyer-Moore bad-character rule.
/// </summary>
public static class BoyerMoore
{
    /// <summary>
    /// Builds the bad-character table: for every character of the pattern except the final one,
    /// the distance from its rightmost occurrence to the last position of the pattern.
    /// </summary>
    private static Dictionary<char, int> BuildBadCharacterTable(string pattern)
    {
        var badCharTable = new Dictionary<char, int>();
        int lastIndex = pattern.Length - 1;

        // Later occurrences overwrite earlier ones, so the rightmost occurrence wins.
        for (int i = 0; i < lastIndex; i++)
        {
            badCharTable[pattern[i]] = lastIndex - i;
        }

        return badCharTable;
    }

    /// <summary>
    /// Computes how far the window moves after a mismatch at pattern index <paramref name="j"/>;
    /// always at least one position.
    /// </summary>
    private static int MismatchShift(Dictionary<char, int> badCharTable, char mismatched, int patternLength, int j)
    {
        // Characters absent from the table leave badChar at 0, which results in a shift of one.
        badCharTable.TryGetValue(mismatched, out int badChar);
        int offset = badChar - patternLength + 1 + j;
        return offset > 1 ? offset : 1;
    }

    /// <summary>
    /// Searches for the first occurrence of a pattern in a target.
    /// </summary>
    /// <param name="target">The string to search in.</param>
    /// <param name="pattern">The string to search for.</param>
    /// <returns>The position of the first occurrence of the pattern, or -1 if it does not occur.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static int BoyerMooreSearchFirst(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        var badCharTable = BuildBadCharacterTable(pattern);
        int patternLength = pattern.Length;
        int endOfSearch = target.Length - patternLength;

        int i = 0;
        while (i <= endOfSearch)
        {
            // Compare right to left.
            int j = patternLength - 1;
            while (j >= 0 && target[i + j] == pattern[j])
            {
                j--;
            }

            if (j < 0)
                return i; // found a match

            i += MismatchShift(badCharTable, target[i + j], patternLength, j);
        }

        return -1;
    }

    /// <summary>
    /// Searches for all occurrences of a pattern in a target, including overlapping ones.
    /// </summary>
    /// <param name="target">The string to search in.</param>
    /// <param name="pattern">The string to search for.</param>
    /// <returns>The positions at which the pattern occurs, in ascending order; empty if none.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static IList<int> BoyerMooreSearchAll(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        var matches = new List<int>();
        var badCharTable = BuildBadCharacterTable(pattern);

        int patternLength = pattern.Length;
        int targetLength = target.Length;
        int endOfSearch = targetLength - patternLength;

        int i = 0;
        while (i <= endOfSearch)
        {
            int j = patternLength - 1;
            while (j >= 0 && target[i + j] == pattern[j])
            {
                j--;
            }

            if (j >= 0)
            {
                i += MismatchShift(badCharTable, target[i + j], patternLength, j);
                continue;
            }

            matches.Add(i); // found a match

            // Choose the next start from the character just past the current window.
            int next = i + patternLength;
            if (next >= targetLength)
            {
                i++;
            }
            else if (patternLength > 0 && target[next] == pattern[patternLength - 1])
            {
                // The table omits the pattern's final position, so a character equal to the last
                // pattern character must be handled here: the pattern may match one step further
                // (e.g. "aa" in "aaa"), and a table-driven jump would skip that occurrence.
                i++;
            }
            else
            {
                badCharTable.TryGetValue(target[next], out int badChar);
                i += badChar + 1;
            }
        }

        return matches;
    }

    /// <summary>
    /// Searches for the first occurrence of each of several patterns in a target.
    /// </summary>
    /// <param name="target">The string to search in.</param>
    /// <param name="patterns">The patterns to search for; duplicates are tolerated.</param>
    /// <returns>
    /// A dictionary from pattern to the position of its first occurrence.
    /// Patterns that do not occur have no entry.
    /// </returns>
    /// <exception cref="ArgumentNullException">An argument, or one of the patterns, is null.</exception>
    public static Dictionary<string, int> BoyerMooreMultipleSearchFirst(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, int>();

        for (int i = 0; i < patterns.Count; i++)
        {
            string pattern = patterns[i];
            int position = BoyerMooreSearchFirst(target, pattern);
            if (position > -1)
                matches[pattern] = position; // indexer: a repeated pattern must not throw
        }

        return matches;
    }

    /// <summary>
    /// Searches for all occurrences of each of several patterns in a target.
    /// </summary>
    /// <param name="target">The string to search in.</param>
    /// <param name="patterns">The patterns to search for; duplicates are tolerated.</param>
    /// <returns>
    /// A dictionary from pattern to the list of positions at which it occurs.
    /// Patterns that do not occur have no entry.
    /// </returns>
    /// <exception cref="ArgumentNullException">An argument, or one of the patterns, is null.</exception>
    public static Dictionary<string, List<int>> BoyerMooreMultipleSearchAll(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var matches = new Dictionary<string, List<int>>();

        for (int i = 0; i < patterns.Count; i++)
        {
            string pattern = patterns[i];
            var positions = new List<int>(BoyerMooreSearchAll(target, pattern));
            if (positions.Count > 0)
                matches[pattern] = positions; // indexer: a repeated pattern must not throw
        }

        return matches;
    }
}
/// <summary>
/// Searches for the first occurrence of a pattern in a target using the Knuth-Morris-Pratt algorithm.
/// </summary>
/// <param name="target">The string to search in.</param>
/// <param name="pattern">The string to search for.</param>
/// <returns>
/// The position of the first occurrence of the pattern, 0 for an empty pattern,
/// or -1 if the pattern does not occur.
/// </returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static int KnuthMorrisPrattSearchFirst(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    int patternLength = pattern.Length;

    // The loop below reads pattern[0]; an empty pattern trivially matches at the start.
    if (patternLength == 0) return 0;

    int[] kmpTable = BuildKMPTable(pattern);
    int targetLength = target.Length;

    int matchIndex = 0;   // start of the current candidate match in the target
    int patternIndex = 0; // number of pattern characters matched so far

    while (matchIndex + patternIndex < targetLength)
    {
        if (pattern[patternIndex] == target[matchIndex + patternIndex])
        {
            patternIndex++;
            if (patternIndex == patternLength)
                return matchIndex;
            continue;
        }

        // Mismatch: consult the failure table for the next candidate.
        int fallback = kmpTable[patternIndex];
        if (fallback > -1)
        {
            // Keep the part of the match that is also a prefix of the pattern.
            matchIndex += patternIndex - fallback;
            patternIndex = fallback;
        }
        else
        {
            // Mismatch on the very first pattern character: just move on.
            matchIndex++;
            patternIndex = 0;
        }
    }

    return -1;
}
/// <summary>
/// Searches for all occurrences of a pattern in a target using the Knuth-Morris-Pratt algorithm.
/// Overlapping occurrences are reported.
/// </summary>
/// <param name="target">The string to search in.</param>
/// <param name="pattern">The string to search for.</param>
/// <returns>
/// The positions at which the pattern occurs, in ascending order; empty if none. An empty pattern
/// matches at every position from 0 to the target length inclusive, which is what
/// BoyerMooreSearchAll yields for the same input.
/// </returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static IList<int> KnuthMorrisPrattSearchAll(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    var matches = new List<int>();
    int targetLength = target.Length;
    int patternLength = pattern.Length;

    // The main loop reads pattern[0], so the empty pattern is answered directly.
    if (patternLength == 0)
    {
        for (int position = 0; position <= targetLength; position++)
        {
            matches.Add(position);
        }
        return matches;
    }

    int[] kmpTable = BuildKMPTable(pattern);

    int matchIndex = 0;   // start of the current candidate match in the target
    int patternIndex = 0; // number of pattern characters matched so far

    while (matchIndex + patternIndex < targetLength)
    {
        if (pattern[patternIndex] == target[matchIndex + patternIndex])
        {
            patternIndex++;
            if (patternIndex < patternLength)
                continue;

            matches.Add(matchIndex);

            // Continue as if the last character had mismatched, so the table
            // picks the next candidate start.
            patternIndex--;
        }

        int fallback = kmpTable[patternIndex];
        if (fallback > -1)
        {
            // Keep the part of the match that is also a prefix of the pattern.
            matchIndex += patternIndex - fallback;
            patternIndex = fallback;
        }
        else
        {
            // Failure on the first pattern character: just move on.
            matchIndex++;
            patternIndex = 0;
        }
    }

    return matches;
}
/// <summary>
/// Searches for the first occurrence of each of several patterns in a target
/// using the Knuth–Morris–Pratt algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Duplicates are searched once.</param>
/// <returns>
/// A <see cref="Dictionary{TKey, TValue}"/> whose keys are the patterns and whose values are the position
/// of the first occurrence. A pattern that is not found has no entry in the dictionary.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="target"/> or <paramref name="patterns"/> is null, or a pattern in the list is null
/// (raised by <see cref="KnuthMorrisPrattSearchFirst"/>).
/// </exception>
public static Dictionary<string, int> KnuthMorrisPrattMultipleSearchFirst(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    var matches = new Dictionary<string, int>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];

        // A repeated pattern was already recorded; adding it again would throw.
        // (A null entry is passed on so the single-pattern search reports it.)
        if (pattern != null && matches.ContainsKey(pattern))
        {
            continue;
        }

        int position = KnuthMorrisPrattSearchFirst(target, pattern);
        if (position > -1)
        {
            matches.Add(pattern, position);
        }
    }

    return matches;
}

/// <summary>
/// Searches for all occurrences of each of several patterns in a target
/// using the Knuth–Morris–Pratt algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Duplicates are searched once.</param>
/// <returns>
/// A <see cref="Dictionary{TKey, TValue}"/> whose keys are the patterns and whose values are
/// <see cref="List{T}"/>s of the positions at which the pattern occurs.
/// A pattern that is not found has no entry in the dictionary.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="target"/> or <paramref name="patterns"/> is null, or a pattern in the list is null
/// (raised by <see cref="KnuthMorrisPrattSearchAll"/>).
/// </exception>
public static Dictionary<string, List<int>> KnuthMorrisPrattMultipleSearchAll(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    var matches = new Dictionary<string, List<int>>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];

        // A repeated pattern was already recorded; adding it again would throw.
        // (A null entry is passed on so the single-pattern search reports it.)
        if (pattern != null && matches.ContainsKey(pattern))
        {
            continue;
        }

        // Copy into a List<int> because the dictionary exposes concrete lists.
        var positions = new List<int>(KnuthMorrisPrattSearchAll(target, pattern));
        if (positions.Count > 0)
        {
            matches.Add(pattern, positions);
        }
    }

    return matches;
}
/// <summary>
/// Searches for the first occurrence of a pattern in a target using the Rabin-Karp algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>
/// The zero-based position of the first occurrence of the pattern, 0 when the pattern is empty,
/// or -1 when the pattern does not occur in the target. Comparison is ordinal.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="pattern"/> is null.</exception>
public static int RabinKarpSearchFirst(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    const ulong Radix = 256;     // base of the polynomial hash (characters are reduced modulo Modulus)
    const ulong Modulus = 65537; // prime modulus of the hash

    int patternLength = pattern.Length;
    int lastStart = target.Length - patternLength; // last window start that still fits

    if (lastStart < 0) return -1;      // pattern longer than target
    if (patternLength == 0) return 0;  // empty pattern matches at the start

    // Hash the pattern and the first window; highPow ends up as Radix^(patternLength - 1) mod Modulus,
    // the weight of the character that leaves the window on each roll.
    ulong patternHash = 0;
    ulong windowHash = 0;
    ulong highPow = 1;
    for (int i = 0; i < patternLength; i++)
    {
        patternHash = (patternHash * Radix + pattern[i]) % Modulus;
        windowHash = (windowHash * Radix + target[i]) % Modulus;
        if (i > 0)
        {
            highPow = (highPow * Radix) % Modulus;
        }
    }

    for (int start = 0; start <= lastStart; start++)
    {
        if (start > 0)
        {
            // Roll the window one character to the right: drop target[start - 1], append the new last character.
            // Modulus is added before subtracting so the unsigned value cannot wrap.
            windowHash = (windowHash + Modulus - highPow * target[start - 1] % Modulus) % Modulus;
            windowHash = (windowHash * Radix + target[start - 1 + patternLength]) % Modulus;
        }

        // Equal hashes may be a collision, so confirm with an in-place ordinal comparison (no substring allocation).
        if (windowHash == patternHash
            && string.CompareOrdinal(target, start, pattern, 0, patternLength) == 0)
        {
            return start;
        }
    }

    return -1;
}
/// <summary>
/// Searches for all occurrences (overlapping ones included) of a pattern in a target
/// using the Rabin-Karp algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>
/// An <see cref="IList{T}"/> of <see cref="int"/> positions, in ascending order, at which the pattern occurs.
/// The list is empty if the pattern is not found. An empty pattern matches at every position
/// from 0 to <c>target.Length</c> inclusive. Comparison is ordinal.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="pattern"/> is null.</exception>
public static IList<int> RabinKarpSearchAll(string target, string pattern)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (pattern == null) throw new ArgumentNullException(nameof(pattern));

    const ulong Radix = 256;     // base of the polynomial hash (characters are reduced modulo Modulus)
    const ulong Modulus = 65537; // prime modulus of the hash

    var matches = new List<int>();

    int patternLength = pattern.Length;
    int lastStart = target.Length - patternLength; // last window start that still fits

    if (lastStart < 0) return matches; // pattern longer than target

    if (patternLength == 0)
    {
        // A rolling hash over a zero-length window is meaningless; the empty string
        // simply occurs at every position.
        for (int position = 0; position <= target.Length; position++)
        {
            matches.Add(position);
        }
        return matches;
    }

    // Hash the pattern and the first window; highPow ends up as Radix^(patternLength - 1) mod Modulus,
    // the weight of the character that leaves the window on each roll.
    ulong patternHash = 0;
    ulong windowHash = 0;
    ulong highPow = 1;
    for (int i = 0; i < patternLength; i++)
    {
        patternHash = (patternHash * Radix + pattern[i]) % Modulus;
        windowHash = (windowHash * Radix + target[i]) % Modulus;
        if (i > 0)
        {
            highPow = (highPow * Radix) % Modulus;
        }
    }

    for (int start = 0; start <= lastStart; start++)
    {
        if (start > 0)
        {
            // Roll the window one character to the right: drop target[start - 1], append the new last character.
            // Modulus is added before subtracting so the unsigned value cannot wrap.
            windowHash = (windowHash + Modulus - highPow * target[start - 1] % Modulus) % Modulus;
            windowHash = (windowHash * Radix + target[start - 1 + patternLength]) % Modulus;
        }

        // Equal hashes may be a collision, so confirm with an in-place ordinal comparison (no substring allocation).
        if (windowHash == patternHash
            && string.CompareOrdinal(target, start, pattern, 0, patternLength) == 0)
        {
            matches.Add(start);
        }
    }

    return matches;
}
/// <summary>
/// Searches for the first occurrence of each of several patterns in a target
/// using the Rabin-Karp algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Duplicates are searched once.</param>
/// <returns>
/// A <see cref="Dictionary{TKey, TValue}"/> whose keys are the patterns and whose values are the position
/// of the first occurrence (0 for an empty pattern). A pattern that is not found, including one longer
/// than the target, has no entry in the dictionary. Comparison is ordinal.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
public static Dictionary<string, int> RabinKarpMultipleSearchFirst(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    const ulong Radix = 256;     // base of the polynomial hash (characters are reduced modulo Modulus)
    const ulong Modulus = 65537; // prime modulus of the hash

    var matches = new Dictionary<string, int>();

    // Distinct patterns grouped by length, each mapped to its hash. All patterns of one length
    // share a single rolling pass over the target.
    var groups = new Dictionary<int, Dictionary<string, ulong>>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern == null)
        {
            throw new ArgumentException("The pattern list must not contain null entries.", nameof(patterns));
        }

        // A pattern longer than the target cannot occur; hashing target[0..length) for it would run out of range.
        if (pattern.Length > target.Length)
        {
            continue;
        }

        // The empty pattern matches at the start and needs no hashing.
        if (pattern.Length == 0)
        {
            matches[pattern] = 0;
            continue;
        }

        Dictionary<string, ulong> group;
        if (!groups.TryGetValue(pattern.Length, out group))
        {
            group = new Dictionary<string, ulong>();
            groups.Add(pattern.Length, group);
        }

        // Ignore repeated patterns instead of failing on a duplicate key.
        if (group.ContainsKey(pattern))
        {
            continue;
        }

        ulong patternHash = 0;
        for (int j = 0; j < pattern.Length; j++)
        {
            patternHash = (patternHash * Radix + pattern[j]) % Modulus;
        }
        group.Add(pattern, patternHash);
    }

    foreach (var entry in groups)
    {
        int patternLength = entry.Key;
        Dictionary<string, ulong> group = entry.Value;
        int lastStart = target.Length - patternLength; // last window start that still fits
        int remaining = group.Count;                   // patterns of this length not yet found

        // Hash of the first window; highPow ends up as Radix^(patternLength - 1) mod Modulus.
        ulong windowHash = 0;
        ulong highPow = 1;
        for (int j = 0; j < patternLength; j++)
        {
            windowHash = (windowHash * Radix + target[j]) % Modulus;
            if (j > 0)
            {
                highPow = (highPow * Radix) % Modulus;
            }
        }

        // Stop scanning as soon as every pattern of this length has been located.
        for (int start = 0; start <= lastStart && remaining > 0; start++)
        {
            if (start > 0)
            {
                // Roll the window one character to the right.
                // Modulus is added before subtracting so the unsigned value cannot wrap.
                windowHash = (windowHash + Modulus - highPow * target[start - 1] % Modulus) % Modulus;
                windowHash = (windowHash * Radix + target[start - 1 + patternLength]) % Modulus;
            }

            foreach (var candidate in group)
            {
                if (candidate.Value != windowHash || matches.ContainsKey(candidate.Key))
                {
                    continue;
                }

                // Equal hashes may be a collision, so confirm with an in-place ordinal comparison.
                if (string.CompareOrdinal(target, start, candidate.Key, 0, patternLength) == 0)
                {
                    matches.Add(candidate.Key, start);
                    remaining--;
                }
            }
        }
    }

    return matches;
}
/// <summary>
/// Searches for all occurrences (overlapping ones included) of each of several patterns in a target
/// using the Rabin-Karp algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns. Duplicates are searched once.</param>
/// <returns>
/// A <see cref="Dictionary{TKey, TValue}"/> whose keys are the patterns and whose values are
/// <see cref="List{T}"/>s of the positions, in ascending order, at which the pattern occurs.
/// A pattern that is not found, including one longer than the target, has no entry in the dictionary.
/// An empty pattern matches at every position from 0 to <c>target.Length</c> inclusive. Comparison is ordinal.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
public static Dictionary<string, List<int>> RabinKarpMultipleSearchAll(string target, IList<string> patterns)
{
    if (target == null) throw new ArgumentNullException(nameof(target));
    if (patterns == null) throw new ArgumentNullException(nameof(patterns));

    const ulong Radix = 256;     // base of the polynomial hash (characters are reduced modulo Modulus)
    const ulong Modulus = 65537; // prime modulus of the hash

    // Entries are created on the first hit, so unfound patterns never appear in the result.
    var matches = new Dictionary<string, List<int>>();

    // Distinct patterns grouped by length, each mapped to its hash. All patterns of one length
    // share a single rolling pass over the target.
    var groups = new Dictionary<int, Dictionary<string, ulong>>();

    for (int i = 0; i < patterns.Count; i++)
    {
        string pattern = patterns[i];
        if (pattern == null)
        {
            throw new ArgumentException("The pattern list must not contain null entries.", nameof(patterns));
        }

        // A pattern longer than the target cannot occur; hashing target[0..length) for it would run out of range.
        if (pattern.Length > target.Length)
        {
            continue;
        }

        // The empty string occurs at every position and needs no hashing.
        if (pattern.Length == 0)
        {
            if (!matches.ContainsKey(pattern))
            {
                var everyPosition = new List<int>(target.Length + 1);
                for (int position = 0; position <= target.Length; position++)
                {
                    everyPosition.Add(position);
                }
                matches.Add(pattern, everyPosition);
            }
            continue;
        }

        Dictionary<string, ulong> group;
        if (!groups.TryGetValue(pattern.Length, out group))
        {
            group = new Dictionary<string, ulong>();
            groups.Add(pattern.Length, group);
        }

        // Ignore repeated patterns instead of failing on a duplicate key.
        if (group.ContainsKey(pattern))
        {
            continue;
        }

        ulong patternHash = 0;
        for (int j = 0; j < pattern.Length; j++)
        {
            patternHash = (patternHash * Radix + pattern[j]) % Modulus;
        }
        group.Add(pattern, patternHash);
    }

    foreach (var entry in groups)
    {
        int patternLength = entry.Key;
        Dictionary<string, ulong> group = entry.Value;
        int lastStart = target.Length - patternLength; // last window start that still fits

        // Hash of the first window; highPow ends up as Radix^(patternLength - 1) mod Modulus.
        ulong windowHash = 0;
        ulong highPow = 1;
        for (int j = 0; j < patternLength; j++)
        {
            windowHash = (windowHash * Radix + target[j]) % Modulus;
            if (j > 0)
            {
                highPow = (highPow * Radix) % Modulus;
            }
        }

        for (int start = 0; start <= lastStart; start++)
        {
            if (start > 0)
            {
                // Roll the window one character to the right.
                // Modulus is added before subtracting so the unsigned value cannot wrap.
                windowHash = (windowHash + Modulus - highPow * target[start - 1] % Modulus) % Modulus;
                windowHash = (windowHash * Radix + target[start - 1 + patternLength]) % Modulus;
            }

            foreach (var candidate in group)
            {
                // Equal hashes may be a collision, so confirm with an in-place ordinal comparison.
                if (candidate.Value != windowHash
                    || string.CompareOrdinal(target, start, candidate.Key, 0, patternLength) != 0)
                {
                    continue;
                }

                List<int> positions;
                if (!matches.TryGetValue(candidate.Key, out positions))
                {
                    positions = new List<int>();
                    matches.Add(candidate.Key, positions);
                }
                positions.Add(start);
            }
        }
    }

    return matches;
}