From 61532ae1a03f84ff929e6a93aaed34affbf6fe1e Mon Sep 17 00:00:00 2001 From: Ogoun Date: Wed, 6 Nov 2024 03:22:09 +0300 Subject: [PATCH] Fix sleopok engine. --- Tests/PartitionFileStorageTest/Program.cs | 2 +- Tests/Sleopok.Tests/Program.cs | 168 ++++--- ZeroLevel.Sleopok.Engine/Models/IndexInfo.cs | 11 +- ZeroLevel.Sleopok.Engine/Models/SleoField.cs | 12 + .../Services/Indexes/IIndexReader.cs | 4 +- .../Services/Indexes/IndexBuilder.cs | 32 +- .../Services/Indexes/IndexReader.cs | 27 +- .../Services/Storage/DataStorage.cs | 106 +++-- ZeroLevel/Services/HashFunctions/Murmur3.cs | 17 + .../Logging/NoLimitedLogMessageBuffer.cs | 4 +- .../PartitionStorage/StoreSerializers.cs | 3 +- .../Services/Semantic/Helpers/TextAnalizer.cs | 19 +- .../Services/Semantic/Model/CharTrieNode.cs | 37 ++ .../Services/Semantic/Model/Character.cs | 9 + .../Semantic/Model/TerminalCharTrieNode.cs | 11 + .../Services/Semantic/Search/PrefixTrie.cs | 429 ++++++++++++++++++ ZeroLevel/Services/Utils/Utility.cs | 72 +++ 17 files changed, 839 insertions(+), 124 deletions(-) create mode 100644 ZeroLevel/Services/Semantic/Model/CharTrieNode.cs create mode 100644 ZeroLevel/Services/Semantic/Model/Character.cs create mode 100644 ZeroLevel/Services/Semantic/Model/TerminalCharTrieNode.cs create mode 100644 ZeroLevel/Services/Semantic/Search/PrefixTrie.cs create mode 100644 ZeroLevel/Services/Utils/Utility.cs diff --git a/Tests/PartitionFileStorageTest/Program.cs b/Tests/PartitionFileStorageTest/Program.cs index 4d07f42..bdb3e5c 100644 --- a/Tests/PartitionFileStorageTest/Program.cs +++ b/Tests/PartitionFileStorageTest/Program.cs @@ -519,7 +519,7 @@ namespace PartitionFileStorageTest EnableIndexInMemoryCachee = true }, RootFolder = folder, - FilePartition = new StoreFilePartition("Host hash", (key, _) => Math.Abs(StringHash.DotNetFullHash(key) % 367).ToString()), + FilePartition = new StoreFilePartition("Host hash", (key, _) => Math.Abs(StringHash.DotNetFullHash(key.ToLowerInvariant()) % 
367).ToString()), MergeFunction = list => { ulong s = 0; diff --git a/Tests/Sleopok.Tests/Program.cs b/Tests/Sleopok.Tests/Program.cs index 40e6f90..5dac06e 100644 --- a/Tests/Sleopok.Tests/Program.cs +++ b/Tests/Sleopok.Tests/Program.cs @@ -1,4 +1,7 @@ -using ZeroLevel.Services.Semantic; +using Iveonik.Stemmers; +using ZeroLevel; +using ZeroLevel.Services.FileSystem; +using ZeroLevel.Services.Semantic; using ZeroLevel.Services.Serialization; using ZeroLevel.Sleopok.Engine; using ZeroLevel.Sleopok.Engine.Models; @@ -9,73 +12,111 @@ namespace Sleopok.Tests { internal class Program { - public sealed class BookDocument + public sealed class BookDocumentSimple { public string Id { get; set; } - [SleoIndex("title", 200.0f)] + [SleoIndex("title", 10.0f, avaliableForExactMatch: true)] public string Title { get; set; } - [SleoIndex("titlelm", 100.0f)] - public string TitleLemmas { get; set; } - - [SleoIndex("author", 10.0f)] + [SleoIndex("author", 10.0f, avaliableForExactMatch: true)] public string Author { get; set; } - - [SleoIndex("genre", 1.0f)] - public string Genre { get; set; } } - private static Dictionary _titles = new Dictionary + public sealed class BookDocument { - { "66056bc0481e83af64c55022", "Документ без названия" }, - { "6605698d481e83af64c45ad7", "На развилке дорог. Часть 2"}, - { "660581bc481e83af64cb8b4d", "Паниклав"}, - { "66057aa2481e83af64c9bb11", "Князь. Война магов (сборник)"}, - { "66057f75481e83af64cb04f7", "Антология севетского детектива-8. Компиляция. Книги 1-17"}, - { "66057bd4481e83af64ca0779", "Вор черной масти"}, - { "66057247481e83af64c76860", "Выбор"}, - { "66056807481e83af64c3a64f", "Последняя лекция"}, - { "66057f13481e83af64caed5d", "Оружие Круппа. 
История династии пушечных королей"}, - { "66057a37481e83af64c9a14b", "Месть Черного Дракона"}, - { "660588e8481e83af64cd2d3e", "Мгла над старыми могилами"}, - { "66056e88481e83af64c64e81", "Кровь и железо"}, - { "66057a8e481e83af64c9b673", "Маленькая страна"}, - { "6605687d481e83af64c3e360", "Санкт-Петербург – история в преданиях и легендах"}, - { "66057987481e83af64c9770c", "Контракт на рабство"}, - { "66059052481e83af64cf5e31", "Агент космического сыска"}, - { "660580f9481e83af64cb61c9", "Две жизни Алессы Коэн"}, - { "66056807481e84af64c3a64f", "Последняя история"}, - { "66057f13481e85af64caed5d", "История Китая"}, - { "66057a37481e86af64c9a14b", "Время Черного Дракона"}, - { "660588e8481e87af64cd2d3e", "Страна которой нет"}, - }; + public string Id { get; set; } + + [SleoIndex("title", 10.0f, avaliableForExactMatch: true)] + public string Title { get; set; } + + [SleoIndex("stemms", 2.0f)] + public string Stemms { get; set; } + + [SleoIndex("author", 10.0f, avaliableForExactMatch: true)] + public string Author { get; set; } + } static async Task Main(string[] args) { - // TestCompression(); - // await FillOneFieldIndex(); + //TestCompression(); // await TestSearch(); - await TestEngine(); + // await TestEngine(); + await TestEngineReadWrite(); + } + + static async Task TestEngineReadWrite() + { + ILexProvider lexProvider = new LexProvider(new RussianStemmer()); + var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage"); + FSUtils.CleanAndTestFolder(tempFolder); + var lex = new Func(s => string.Join(" ", lexProvider.ExtractUniqueLexTokens(s).Select(s => s.Token))); + var engine = new SleoEngine(tempFolder, b => b.Id); + using (var builder = engine.CreateBuilder()) + { + await builder.Write(new[] + { + //new BookDocument { Id = "01", Title = "Юность Пушкина", Author = "Егорова Елена", Stemms = lex("Юность Пушкина") }, + new BookDocumentSimple { Id = "01", Title = "Стихи Не Для Дам", Author = "Пушкин Александр Сергеевич" }, + new 
BookDocumentSimple { Id = "02", Title = "Светлинен стих", Author = "Азимов Айзък" }, + }); + } + var reader = engine.CreateReader(); + var result = await reader.Search(new[] { "стихи", "пушкина" }, false); + foreach (var pair in result) + { + Console.WriteLine($"[{pair.Key}]: {pair.Value}"); + } + //await foreach (var fieldRecords in reader.GetAll()) + //{ + // Console.WriteLine(fieldRecords.Field); + //} } static async Task TestEngine() { - var engine = new SleoEngine(@"H:\Test", b => b.Id); + ILexProvider lexProvider = new LexProvider(new RussianStemmer()); + + var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage"); + FSUtils.CleanAndTestFolder(tempFolder); + + var lex = new Func(s => string.Join(" ", lexProvider.ExtractUniqueLexTokens(s).Select(s => s.Token))); + + var engine = new SleoEngine(tempFolder, b => b.Id); using (var builder = engine.CreateBuilder()) { - builder.Write(new[] + await builder.Write(new[] { - new BookDocument{ Id = "01", Title = "Страж птица", }, - new BookDocument{ Id = "02" }, - new BookDocument{ Id = "03" }, - new BookDocument{ Id = "04" }, + new BookDocument{ Id = "01", Title = "Юность Пушкина", Author = "Егорова Елена", Stemms = lex("Юность Пушкина") }, + new BookDocument{ Id = "02", Title = "Детство Александра Пушкина", Author = "Егорова Елена Николаевна", Stemms = lex("Детство Александра Пушкина") }, + new BookDocument{ Id = "03", Title = "Избранные стихи", Author = "Александра Пушкина", Stemms = lex("Избранные стихи") }, + new BookDocument{ Id = "04", Title = "Анализ стихотворений Александра Сергеевича Пушкина", Author = "Ланцов Михаил", Stemms = lex("Анализ стихотворений Александра Сергеевича Пушкина") }, + + new BookDocument{ Id = "05", Title = "Море обаяния", Author = "Искандер Фазиль", Stemms = lex("Море обаяния") }, + new BookDocument{ Id = "06", Title = "«Какаду»", Author = "Клысь Рышард", Stemms = lex("«Какаду»") }, + new BookDocument{ Id = "07", Title = "Ряд случайных чисел [СИ]", Author = "Павлова 
Елена Евгеньевна", Stemms = lex("Ряд случайных чисел [СИ]") }, + new BookDocument{ Id = "08", Title = "Последняя любовь. Плен и свобода", Author = "Мятная Витамина", Stemms = lex("Последняя любовь. Плен и свобода") }, + + new BookDocument{ Id = "09", Title = "Золотой ус. Лучшие рецепты исцеления", Author = "Альменов Чингиз", Stemms = lex("Золотой ус. Лучшие рецепты исцеления") }, + new BookDocument{ Id = "10", Title = "Пушки смотрят на восток", Author = "Ефимова Марина Михайловна", Stemms = lex("Пушки смотрят на восто") }, + new BookDocument{ Id = "11", Title = "Чингиз Хан, становление", Author = "Пушной Виталий", Stemms = lex("Чингиз Хан, становление") }, }); } + + var reader = engine.CreateReader(); + var result = await reader.Search(new[] { "Елена", "Евгеньевна" }, false); + foreach (var pair in result) + { + Console.WriteLine($"[{pair.Key}]: {pair.Value}"); + } } static void TestCompression() { + var one_zip = Compressor.Compress(new[] { "02" } ); + var one_unzip = Compressor.DecompressToDocuments(one_zip); + + var strings = new string[] { string.Empty, @@ -111,30 +152,41 @@ namespace Sleopok.Tests } } - static async Task FillOneFieldIndex() + static async Task TestSearch() { - var store = new DataStorage(@"H:\TEST"); + var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage"); + FSUtils.CleanAndTestFolder(tempFolder); + + var store = new DataStorage(tempFolder); + + using (var writer = store.GetWriter("author")) + { + await writer.Write("Козлов Игорь", "1"); + await writer.Write("Ермакова Светлана Евгеньевна", "2"); + await writer.Write("Муркок Майкл Лаумер Кейт Пик Мервин Ле Гуин Урсула Дилэни Сэмюэль Баллард Джеймс Грэм Эллисон Харлан Диксон Гордон Нивен Ларри Корнблат Сирил М Вульф Джин Лейбер Фриц Ройтер", "3"); + await writer.Write("Коллектив Авторов", "4"); + await writer.Write("Боннэр Елена Георгиевна", "5"); + await writer.Write("Звёздкина Анна ", "6"); + await writer.Complete(); + } + using (var writer = store.GetWriter("title")) { 
- foreach (var kv in _titles) - { - var tokens = WordTokenizer.Tokenize(kv.Value); - foreach (var t in tokens) - { - await writer.Write(t, kv.Key); - } - } + await writer.Write("Подкова на счастье", "1"); + await writer.Write("Среднеазиатская овчарка", "2"); + await writer.Write("Багряная игра. Сборник англо-американской фантастики", "3"); + await writer.Write("Управление проектами. Фундаментальный курс", "4"); + await writer.Write("Постскриптум: Книга о горьковской ссылке", "5"); + await writer.Write("Фарватер", "6"); await writer.Complete(); } - } - static async Task TestSearch() - { - var store = new DataStorage(@"H:\TEST"); - var docs = await store.GetDocuments("title", new string[] { "кровь", "страна", "железо", "история", "оружие" }, 1.0f, false); + + + var docs = await store.GetDocuments("title", new string[] { "Подкова на счастье" }, 1.0f, false); foreach (var kv in docs.OrderByDescending(kv => kv.Value)) { - Console.WriteLine($"[{kv.Key}: {kv.Value}] {_titles[kv.Key]}"); + Console.WriteLine($"[ID] = {kv.Key}: {kv.Value}"); } } } diff --git a/ZeroLevel.Sleopok.Engine/Models/IndexInfo.cs b/ZeroLevel.Sleopok.Engine/Models/IndexInfo.cs index fc45d4a..32cc95c 100644 --- a/ZeroLevel.Sleopok.Engine/Models/IndexInfo.cs +++ b/ZeroLevel.Sleopok.Engine/Models/IndexInfo.cs @@ -1,10 +1,8 @@ using System; using System.Collections.Generic; using System.Reflection; -using ZeroLevel; using ZeroLevel.Services.FileSystem; using ZeroLevel.Services.Reflection; -using ZeroLevel.Services.Extensions; namespace ZeroLevel.Sleopok.Engine.Models { @@ -34,6 +32,14 @@ namespace ZeroLevel.Sleopok.Engine.Models var sleoAttribute = member.GetCustomAttribute(); if (sleoAttribute == null) continue; + var type = SleoFieldType.Single; + if (TypeHelpers.IsGenericCollection(member.DeclaringType) + || TypeHelpers.IsArray(member.DeclaringType) + || TypeHelpers.IsEnumerable(member.DeclaringType)) + { + type = SleoFieldType.Array; + } + Func getter; switch (member.MemberType) { @@ -48,6 +54,7 @@ 
namespace ZeroLevel.Sleopok.Engine.Models var name = FSUtils.FileNameCorrection(string.IsNullOrWhiteSpace(sleoAttribute.Name) ? member.Name : sleoAttribute.Name); _fields.Add(new SleoField { + FieldType = type, Boost = sleoAttribute.Boost, Name = name, Getter = getter, diff --git a/ZeroLevel.Sleopok.Engine/Models/SleoField.cs b/ZeroLevel.Sleopok.Engine/Models/SleoField.cs index 21a61d7..dc86aa0 100644 --- a/ZeroLevel.Sleopok.Engine/Models/SleoField.cs +++ b/ZeroLevel.Sleopok.Engine/Models/SleoField.cs @@ -2,8 +2,20 @@ namespace ZeroLevel.Sleopok.Engine.Models { + public enum SleoFieldType + { + /// + /// One value + /// + Single = 0, + /// + /// Array of values + /// + Array = 1, + } internal sealed class SleoField { + public SleoFieldType FieldType; public string Name; public float Boost; public bool ExactMatch; diff --git a/ZeroLevel.Sleopok.Engine/Services/Indexes/IIndexReader.cs b/ZeroLevel.Sleopok.Engine/Services/Indexes/IIndexReader.cs index e5fd213..463aacc 100644 --- a/ZeroLevel.Sleopok.Engine/Services/Indexes/IIndexReader.cs +++ b/ZeroLevel.Sleopok.Engine/Services/Indexes/IIndexReader.cs @@ -1,10 +1,12 @@ using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; namespace ZeroLevel.Sleopok.Engine.Services.Indexes { public interface IIndexReader { - Task> Search(string[] tokens, bool exactMatch); + Task>> Search(string[] tokens, bool exactMatch); + IAsyncEnumerable GetAll(); } } diff --git a/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexBuilder.cs b/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexBuilder.cs index 66f172e..829261e 100644 --- a/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexBuilder.cs +++ b/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexBuilder.cs @@ -1,5 +1,8 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; +using ZeroLevel.Implementation.Semantic.Helpers; using ZeroLevel.Sleopok.Engine.Models; using 
ZeroLevel.Sleopok.Engine.Services.Storage; @@ -8,6 +11,8 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes internal sealed class IndexBuilder : IIndexBuilder { + private static char[] _separators = new char[] { ',', ' ', '.', '?', '!', '\\', '/', '+', '&' }; + private readonly DataStorage _storage; private readonly IndexInfo _indexInfo; private readonly Dictionary Indexers = new Dictionary(); @@ -30,6 +35,15 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes } } + private static IEnumerable Preprocess(string value) + { + if (string.IsNullOrWhiteSpace(value) == false) + { + return TextAnalizer.ExtractWords(value).Select(w=>w.ToLowerInvariant()); + } + return Enumerable.Empty(); + } + public async Task Write(IEnumerable batch) { foreach (var doc in batch) @@ -37,12 +51,20 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes var doc_id = _indexInfo.GetId(doc); foreach (var field in _indexInfo.Fields) { - var value = field.Getter(doc!)?.ToString() ?? string.Empty; - if (string.IsNullOrWhiteSpace(value) == false) + if (field.FieldType == SleoFieldType.Array) + { + // TO DO OPTIMIZATION + // Если поле уже хранит массив элементов, считать каждый элемент токеном + } + else { - foreach (var t in value.Split(' ')) + var value = field.Getter(doc!)?.ToString() ?? 
string.Empty; + if (string.IsNullOrWhiteSpace(value) == false) { - await Indexers[field.Name].Write(t, doc_id); + foreach (var t in Preprocess(value)) + { + await Indexers[field.Name].Write(t, doc_id); + } } } } diff --git a/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexReader.cs b/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexReader.cs index f428193..b7614de 100644 --- a/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexReader.cs +++ b/ZeroLevel.Sleopok.Engine/Services/Indexes/IndexReader.cs @@ -1,10 +1,17 @@ using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; using ZeroLevel.Sleopok.Engine.Models; using ZeroLevel.Sleopok.Engine.Services.Storage; namespace ZeroLevel.Sleopok.Engine.Services.Indexes { + public class FieldRecords + { + public string Field { get; set; } + public Dictionary> Records { get; set; } + } + internal sealed class IndexReader : IIndexReader { @@ -16,13 +23,12 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes _indexInfo = indexInfo; } - public async Task> Search(string[] tokens, bool exactMatch) + public async Task>> Search(string[] tokens, bool exactMatch) { var documents = new Dictionary(); - foreach (var field in _indexInfo.Fields) { - if (exactMatch && field.ExactMatch == false) + if (exactMatch && field.ExactMatch == false) continue; var docs = await _storage.GetDocuments(field.Name, tokens, field.Boost, exactMatch); foreach (var doc in docs) @@ -40,7 +46,20 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes } } } - return documents; + return documents.OrderByDescending(d => d.Value); + } + + public async IAsyncEnumerable GetAll() + { + foreach (var field in _indexInfo.Fields) + { + var docs = await _storage.GetAllDocuments(field.Name); + yield return new FieldRecords + { + Field = field.Name, + Records = docs + }; + } } } } diff --git a/ZeroLevel.Sleopok.Engine/Services/Storage/DataStorage.cs b/ZeroLevel.Sleopok.Engine/Services/Storage/DataStorage.cs index d19493d..d288758 100644 --- 
a/ZeroLevel.Sleopok.Engine/Services/Storage/DataStorage.cs +++ b/ZeroLevel.Sleopok.Engine/Services/Storage/DataStorage.cs @@ -10,7 +10,37 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage { public sealed class DataStorage { - private readonly IStore _store; + #region Private + private class PositionDocScore + { + private float score = 0.0f; + private int _last_position = -1; + private int count = 0; + + public float GetScore(int total, bool exactMatch) + { + if (exactMatch) + { + return (count == total) ? 1.0f : 0f; + } + return (score / (float)total) * count; + } + + public void Increase(int position) + { + if (position == 0) + { + score = 1.0f; + } + else + { + var diff = position - _last_position; + score += 1.0f + 1.0f / diff; + } + _last_position = position; + count++; + } + } private class DateSourceWriter : IPartitionDataWriter @@ -26,7 +56,6 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage _builder.CompleteAdding(); _builder.Compress(); await _builder.RebuildIndex(); - _builder.Dispose(); } public async Task Write(string host, string document) @@ -41,6 +70,9 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage _builder.Dispose(); } } + #endregion + + private readonly IStore _store; public DataStorage(string rootFolder) { @@ -63,7 +95,7 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage EnableIndexInMemoryCachee = false }, RootFolder = rootFolder, - FilePartition = new StoreFilePartition("Token hash", (token, _) => Math.Abs(StringHash.DotNetFullHash(token) % 47).ToString()), + FilePartition = new StoreFilePartition("Token hash", (token, _) => Math.Abs(StringHash.DotNetFullHash(token.ToLowerInvariant()) % 47).ToString()), MergeFunction = list => { return Compressor.Compress(list.OrderBy(c => c).ToArray()); @@ -83,37 +115,14 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage return new DateSourceWriter(_store.CreateBuilder(new StoreMetadata { Field = field })); } - private class PositionDocScore - { - private float score = 0.0f; - private 
int _last_position = -1; - private int count = 0; - - public float GetScore(int total, bool exactMatch) - { - if (exactMatch) - { - return (count == total) ? 1.0f : 0f; - } - return (score / (float)total) * count; - } - - public void Increase(int position) - { - if (position == 0) - { - score = 1.0f; - } - else - { - var diff = position - _last_position; - score += 1.0f / diff; - } - _last_position = position; - count++; - } - } - + /// + /// Поиск документов. + /// + /// Поле по которому производится поиск. + /// Поисковый запрос. + /// Множитель ранга для результата поиска. + /// true - если искать только точные совпадения. + /// Список идентификаторов документов с коэффициентом ранжирования. public async Task> GetDocuments(string field, string[] tokens, float boost, bool exactMatch) { var documents = new Dictionary(); @@ -125,7 +134,7 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage int step = 0; foreach (var token in tokens) { - var sr = await accessor.Find(token); + var sr = await accessor.Find(token.ToLowerInvariant()); if (sr.Success) { foreach (var doc in Compressor.DecompressToDocuments(sr.Value)) @@ -137,12 +146,39 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage documents[doc].Increase(step); } } + step++; } } } return documents.ToDictionary(d => d.Key, d => boost * d.Value.GetScore(tokens.Length, exactMatch)); } + public async Task>> GetAllDocuments(string field) + { + var documents = new Dictionary>(); + var accessor = _store.CreateAccessor(new StoreMetadata { Field = field }); + if (accessor != null) + { + using (accessor) + { + await foreach (var data in accessor.Iterate()) + { + data.Deconstruct(out string key, out byte[] val); + var docs = Compressor.DecompressToDocuments(val); + if (documents.TryGetValue(key, out var documentsIds)) + { + documentsIds.AddRange(docs); + } + else + { + documents[key] = new List(docs); + } + } + } + } + return documents; + } + public async Task Dump(string key, Stream stream) { using (TextWriter writer = 
new StreamWriter(stream)) diff --git a/ZeroLevel/Services/HashFunctions/Murmur3.cs b/ZeroLevel/Services/HashFunctions/Murmur3.cs index 68d25da..847ff67 100644 --- a/ZeroLevel/Services/HashFunctions/Murmur3.cs +++ b/ZeroLevel/Services/HashFunctions/Murmur3.cs @@ -302,5 +302,22 @@ namespace ZeroLevel.Services.HashFunctions [MethodImpl(MethodImplOptions.AggressiveInlining)] public static ulong GetUInt64(this byte[] bb, int pos) => (ulong)(bb[pos++] | bb[pos++] << 8 | bb[pos++] << 16 | bb[pos++] << 24); + + /// + /// A 32-bit murmur3 implementation. + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Compute(int h) + { + uint a = (uint)h; + a ^= a >> 16; + a *= 0x85ebca6b; + a ^= a >> 13; + a *= 0xc2b2ae35; + a ^= a >> 16; + return (int)a; + } } } diff --git a/ZeroLevel/Services/Logging/NoLimitedLogMessageBuffer.cs b/ZeroLevel/Services/Logging/NoLimitedLogMessageBuffer.cs index d2b4252..c6e9c0b 100644 --- a/ZeroLevel/Services/Logging/NoLimitedLogMessageBuffer.cs +++ b/ZeroLevel/Services/Logging/NoLimitedLogMessageBuffer.cs @@ -3,7 +3,7 @@ using System.Collections.Concurrent; namespace ZeroLevel.Logging { - internal sealed class NoLimitedLogMessageBuffer + internal sealed class NoLimitedLogMessageBuffer : ILogMessageBuffer { private readonly BlockingCollection> _messageQueue = @@ -15,7 +15,7 @@ namespace ZeroLevel.Logging { get { - if (_messageQueue.IsCompleted) + if (_isDisposed || _messageQueue.IsCompleted) return 0; return _messageQueue.Count; } diff --git a/ZeroLevel/Services/PartitionStorage/StoreSerializers.cs b/ZeroLevel/Services/PartitionStorage/StoreSerializers.cs index dda264b..b1dd602 100644 --- a/ZeroLevel/Services/PartitionStorage/StoreSerializers.cs +++ b/ZeroLevel/Services/PartitionStorage/StoreSerializers.cs @@ -19,7 +19,8 @@ namespace ZeroLevel.Services.PartitionStorage private readonly Func>> _inputDeserializer; private readonly Func>> _valueDeserializer; - public StoreSerializers(Func keySerializer, + public 
StoreSerializers( + Func keySerializer, Func inputSerializer, Func valueSerializer, Func>> keyDeserializer, diff --git a/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs b/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs index a70f4b6..e094b09 100644 --- a/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs +++ b/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs @@ -19,24 +19,18 @@ namespace ZeroLevel.Implementation.Semantic.Helpers /// Words public static IEnumerable ExtractWords(string text) { - var result = new List(); foreach (Match match in ReWord.Matches(text)) { - result.Add(match.Value); + yield return match.Value; } - - return result; } public static IEnumerable ExtractRuWords(string text) { - var result = new List(); foreach (Match match in ReRuWord.Matches(text)) { - result.Add(match.Value); + yield return match.Value; } - - return result; } /// @@ -66,23 +60,18 @@ namespace ZeroLevel.Implementation.Semantic.Helpers /// Tokens public static IEnumerable ExtractWordTokens(string text) { - var result = new List(); foreach (Match match in ReWord.Matches(text)) { - result.Add(new WordToken(match.Value, match.Index)); + yield return new WordToken(match.Value, match.Index); } - - return result; } public static IEnumerable ExtractWordTokens(string[] words) { - var result = new List(); for (int i = 0; i < words.Length; i++) { - result.Add(new WordToken(words[i], i)); + yield return new WordToken(words[i], i); } - return result; } /// diff --git a/ZeroLevel/Services/Semantic/Model/CharTrieNode.cs b/ZeroLevel/Services/Semantic/Model/CharTrieNode.cs new file mode 100644 index 0000000..f59b907 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Model/CharTrieNode.cs @@ -0,0 +1,37 @@ +using System; + +namespace ZeroLevel.Services.Semantic.Model +{ + internal class CharTrieNode(char key) + { + public char Key { get; } = key; + + public virtual bool IsTerminal => false; + + public CharTrieNode[] Children { get; set; } = []; + + public void AddChild(CharTrieNode 
node) + { + var children = new CharTrieNode[Children.Length + 1]; + Array.Copy(Children, children, Children.Length); + children[^1] = node; + Children = children; + } + + public void RemoveChildAt(int index) + { + var children = new CharTrieNode[Children.Length - 1]; + Children[index] = Children[^1]; + Array.Copy(Children, children, children.Length); + Children = children; + } + + public void CopyChildren(CharTrieNode[] toCopy) + { + Children = new CharTrieNode[toCopy.Length]; + Array.Copy(toCopy, Children, Children.Length); + } + + public override string ToString() => $"Key: {Key}"; + } +} diff --git a/ZeroLevel/Services/Semantic/Model/Character.cs b/ZeroLevel/Services/Semantic/Model/Character.cs new file mode 100644 index 0000000..5b006c8 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Model/Character.cs @@ -0,0 +1,9 @@ +namespace ZeroLevel.Services.Semantic.Model +{ + public readonly record struct Character(char Char) + { + public static Character Any { get; } = new(); + + public static implicit operator Character(char c) => new(c); + } +} diff --git a/ZeroLevel/Services/Semantic/Model/TerminalCharTrieNode.cs b/ZeroLevel/Services/Semantic/Model/TerminalCharTrieNode.cs new file mode 100644 index 0000000..80b3f34 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Model/TerminalCharTrieNode.cs @@ -0,0 +1,11 @@ +namespace ZeroLevel.Services.Semantic.Model +{ + internal class TerminalCharTrieNode(char key) : CharTrieNode(key) + { + public override bool IsTerminal => true; + + public string Word { get; init; } = null!; + + public override string ToString() => $"Key: {Key}, Word: {Word}"; + } +} diff --git a/ZeroLevel/Services/Semantic/Search/PrefixTrie.cs b/ZeroLevel/Services/Semantic/Search/PrefixTrie.cs new file mode 100644 index 0000000..7f66bb3 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Search/PrefixTrie.cs @@ -0,0 +1,429 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using ZeroLevel.Services.Semantic.Model; 
+ +/// +/// https://github.com/kpol/trie +/// + +namespace ZeroLevel.Services.Semantic.Search +{ + public sealed class PrefixTrie + : ICollection, IReadOnlyCollection + { + private readonly IEqualityComparer _comparer; + + private readonly CharTrieNode _root = new(char.MinValue); + + public PrefixTrie(IEqualityComparer? comparer = null) + { + _comparer = comparer ?? EqualityComparer.Default; + } + + public int Count { get; private set; } + + bool ICollection.IsReadOnly => false; + + public bool Add(string word) + { + if(string.IsNullOrWhiteSpace(word)) throw new ArgumentException(nameof(word)); + + var (existingTerminalNode, parent) = AddNodesFromUpToBottom(word); + + if (existingTerminalNode is not null && existingTerminalNode.IsTerminal) return false; // already exists + + var newTerminalNode = new TerminalCharTrieNode(word[^1]) { Word = word }; + + AddTerminalNode(parent, existingTerminalNode, newTerminalNode, word); + + return true; + } + + public void Clear() + { + _root.Children = []; + Count = 0; + } + + public bool Contains(string word) => Contains(word.AsSpan()); + + public int IntersectionWith(string word) => IntersectionWith(word.AsSpan()); + + public int IntersectionWith(ReadOnlySpan word) + { + if (word.IsEmpty) + { + return 0; + } + return CalculateIntersection(word); + } + + public bool Contains(ReadOnlySpan word) + { + if (word.IsEmpty) + { + if (string.IsNullOrWhiteSpace(word.ToString())) throw new ArgumentException(nameof(word)); + } + + var node = GetNode(word); + + return node is not null && node.IsTerminal; + } + + public bool Remove(string word) + { + if (string.IsNullOrWhiteSpace(word)) throw new ArgumentException(nameof(word)); + + var nodesUpToBottom = GetNodesForRemoval(word); + + if (nodesUpToBottom.Count == 0) return false; + + RemoveNode(nodesUpToBottom); + + return true; + } + + public IEnumerable StartsWith(string value) + { + if (string.IsNullOrWhiteSpace(value)) throw new ArgumentException(nameof(value)); + + return _(); + + 
IEnumerable _() => GetTerminalNodesByPrefix(value).Select(n => n.Word); + } + + public IEnumerable Matches(IReadOnlyList pattern) + { + if (pattern == null) throw new ArgumentNullException(nameof(pattern)); + if(pattern.Count == 0) throw new ArgumentOutOfRangeException(nameof(pattern)); + + return _(); + + IEnumerable _() => + GetNodesByPattern(pattern) + .Where(n => n.IsTerminal) + .Cast() + .Select(n => n.Word); + } + + public IEnumerable StartsWith(IReadOnlyList pattern) + { + if (pattern == null) throw new ArgumentNullException(nameof(pattern)); + if (pattern.Count == 0) throw new ArgumentOutOfRangeException(nameof(pattern)); + + return _(); + + IEnumerable _() + { + foreach (var n in GetNodesByPattern(pattern)) + { + if (n.IsTerminal) + { + yield return ((TerminalCharTrieNode)n).Word; + } + + foreach (var terminalNode in GetDescendantTerminalNodes(n)) + { + yield return terminalNode.Word; + } + } + } + } + + internal (CharTrieNode? existingTerminalNode, CharTrieNode parent) AddNodesFromUpToBottom(ReadOnlySpan word) + { + var current = _root; + + for (int i = 0; i < word.Length - 1; i++) + { + var n = GetChildNode(current, word[i]); + + if (n is not null) + { + current = n; + } + else + { + CharTrieNode node = new(word[i]); + AddToNode(current, node); + current = node; + } + } + + var terminalNode = GetChildNode(current, word[^1]); + + return (terminalNode, current); + } + + internal void AddTerminalNode(CharTrieNode parent, CharTrieNode? existingNode, CharTrieNode newTerminalNode, string word) + { + if (existingNode is not null) + { + newTerminalNode.CopyChildren(existingNode.Children); + + RemoveChildFromNode(parent, word[^1]); + } + + AddToNode(parent, newTerminalNode); + Count++; + } + + internal IEnumerable GetTerminalNodesByPrefix(ReadOnlySpan prefix) + { + var node = GetNode(prefix); + return GetTerminalNodes(node); + } + + private IEnumerable GetTerminalNodes(CharTrieNode? 
node) + { + if (node is null) + { + yield break; + } + + if (node.IsTerminal) + { + yield return (TerminalCharTrieNode)node; + } + + foreach (var n in GetDescendantTerminalNodes(node)) + { + yield return n; + } + } + + public IEnumerator GetEnumerator() => GetAllTerminalNodes().Select(n => n.Word).GetEnumerator(); + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + + void ICollection.Add(string word) + { + if (string.IsNullOrWhiteSpace(word)) throw new ArgumentException(nameof(word)); + + Add(word); + } + + void ICollection.CopyTo(string[] array, int arrayIndex) + { + if(array == null) throw new ArgumentNullException(nameof(array)); + if(arrayIndex < 0) throw new ArgumentOutOfRangeException(nameof(arrayIndex)); + + if (Count > array.Length - arrayIndex) + { + throw new ArgumentException( + "The number of elements in the trie is greater than the available space from index to the end of the destination array."); + } + + foreach (var node in GetAllTerminalNodes()) + { + array[arrayIndex++] = node.Word; + } + } + + internal IEnumerable GetAllTerminalNodes() => GetDescendantTerminalNodes(_root); + + internal static IEnumerable GetDescendantTerminalNodes(CharTrieNode node) + { + Queue queue = new(node.Children); + + while (queue.Count > 0) + { + var n = queue.Dequeue(); + + if (n.IsTerminal) + { + yield return (TerminalCharTrieNode)n; + } + + for (var i = 0; i < n.Children.Length; i++) + { + queue.Enqueue(n.Children[i]); + } + } + } + + internal int CalculateIntersection(ReadOnlySpan prefix) + { + var current = _root; + + for (var i = 0; i < prefix.Length; i++) + { + current = GetChildNode(current, prefix[i]); + + if (current is null) + { + return i; + } + } + + return prefix.Length; + } + + internal CharTrieNode? 
GetNode(ReadOnlySpan prefix) + { + var current = _root; + + for (var i = 0; i < prefix.Length; i++) + { + current = GetChildNode(current, prefix[i]); + + if (current is null) + { + return null; + } + } + + return current; + } + + internal IEnumerable GetNodesByPattern(IReadOnlyList pattern) + { + Queue<(CharTrieNode node, int index)> queue = []; + queue.Enqueue((_root, 0)); + + while (queue.Count > 0) + { + var (node, index) = queue.Dequeue(); + + if (index == pattern.Count - 1) + { + if (pattern[index] != Character.Any) + { + var n = GetChildNode(node, pattern[index].Char); + + if (n is not null) + { + yield return n; + } + } + else + { + for (var i = 0; i < node.Children.Length; i++) + { + yield return node.Children[i]; + } + } + } + else + { + if (pattern[index] != Character.Any) + { + var n = GetChildNode(node, pattern[index].Char); + + if (n is not null) + { + queue.Enqueue((n, index + 1)); + } + } + else + { + for (var i = 0; i < node.Children.Length; i++) + { + queue.Enqueue((node.Children[i], index + 1)); + } + } + } + } + } + + private Stack GetNodesForRemoval(string prefix) + { + var current = _root; + + Stack nodesUpToBottom = []; + nodesUpToBottom.Push(_root); + + for (var i = 0; i < prefix.Length; i++) + { + var c = prefix[i]; + current = GetChildNode(current, c); + + if (current is not null) + { + nodesUpToBottom.Push(current); + } + else + { + return []; + } + } + + return current.IsTerminal ? nodesUpToBottom : []; + } + + private void RemoveNode(Stack nodesUpToBottom) + { + Count--; + + var node = nodesUpToBottom.Pop(); + + if (node.Children.Length == 0) + { + while (node.Children.Length == 0 && nodesUpToBottom.Count > 0) + { + var parent = nodesUpToBottom.Pop(); + RemoveChildFromNode(parent, node.Key); + + if (parent.IsTerminal) return; + + node = parent; + + } + } + else + { + // convert node to non-terminal node + CharTrieNode n = new(node.Key); + n.CopyChildren(node.Children); + + var parent = nodesUpToBottom.Count == 0 ? 
_root : nodesUpToBottom.Pop(); + + RemoveChildFromNode(parent, node.Key); + AddToNode(parent, n); + } + } + + private void AddToNode(CharTrieNode node, CharTrieNode nodeToAdd) + { + for (var i = 0; i < node.Children.Length; i++) + { + if (_comparer.Equals(nodeToAdd.Key, node.Children[i].Key)) + { + return; + } + } + + node.AddChild(nodeToAdd); + } + + private void RemoveChildFromNode(CharTrieNode node, char key) + { + for (int i = 0; i < node.Children.Length; i++) + { + if (_comparer.Equals(key, node.Children[i].Key)) + { + node.RemoveChildAt(i); + + break; + } + } + } + + private CharTrieNode? GetChildNode(CharTrieNode node, char key) + { + for (var i = 0; i < node.Children.Length; i++) + { + var n = node.Children[i]; + + if (_comparer.Equals(key, n.Key)) + { + return n; + } + } + + return null; + } + } +} diff --git a/ZeroLevel/Services/Utils/Utility.cs b/ZeroLevel/Services/Utils/Utility.cs new file mode 100644 index 0000000..5a715a3 --- /dev/null +++ b/ZeroLevel/Services/Utils/Utility.cs @@ -0,0 +1,72 @@ +using System; + +namespace ZeroLevel.Services.Utils +{ + public static class Utility + { + /// + /// Parse size in string notation into long. + /// Examples: 4k, 4K, 4KB, 4 KB, 8m, 8MB, 12g, 12 GB, 16t, 16 TB, 32p, 32 PB. 
///
        /// Decimal digits are accumulated as a base-10 number; the first unit letter
        /// (k, m, g, t or p, in either case) multiplies the accumulated number by the
        /// matching power of 1024 and ends parsing. Every other character (spaces, signs,
        /// '.', a trailing 'B', ...) is skipped, so "1.5k" is read as 15k and "-4k" as 4k.
        /// <param name="value">String version of number.</param>
        /// <returns>The number of bytes; 0 when the string contains no digits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
        /// <exception cref="OverflowException">The result does not fit into a <see cref="long"/>.</exception>
        public static long ParseSize(string value)
        {
            if (value is null)
            {
                throw new ArgumentNullException(nameof(value));
            }

            // Position in this string + 1 is the power of 1024 the unit letter stands for.
            const string suffixes = "kmgtp";

            long result = 0;
            checked // report overflow instead of silently wrapping to a meaningless value
            {
                foreach (char c in value)
                {
                    if (char.IsDigit(c))
                    {
                        // char.IsDigit also accepts non-ASCII decimal digits, so take the
                        // numeric value from the character itself rather than from its low byte.
                        result = result * 10 + (long)char.GetNumericValue(c);
                        continue;
                    }

                    int unit = suffixes.IndexOf(char.ToLowerInvariant(c));
                    if (unit >= 0)
                    {
                        // 1024^(unit + 1) expressed as an exact power of two.
                        return result * (1L << (10 * (unit + 1)));
                    }
                }
            }

            return result;
        }

        /// <summary>
        /// Pretty print value: formats a byte count with binary (1024-based) units,
        /// e.g. 1024 becomes "1KB" and 1536 becomes "1.5KB" (number formatted with the current culture).
        /// </summary>
        /// <param name="value">Size in bytes.</param>
        /// <returns>
        /// The value divided by 1024 until its magnitude drops below 1000 or the largest
        /// known unit (PB) is reached, followed by the unit suffix.
        /// </returns>
        internal static string PrettySize(long value)
        {
            const string suffixes = "KMGTP";

            double scaled = value;
            int unit = 0; // 0 = bytes, 1 = K, 2 = M, ... suffixes.Length = P

            // A long never has a fractional part, so only scaling down is needed.
            // The magnitude is compared so the sign of a negative value is not mistaken
            // for a digit, and scaling stops at the last suffix instead of running past it.
            while (Math.Abs(scaled) >= 1000 && unit < suffixes.Length)
            {
                unit++;
                scaled = Math.Round(scaled / 1024, 12);
            }

            return unit == 0
                ? scaled.ToString() + "B"
                : scaled.ToString() + suffixes[unit - 1] + "B";
        }
    }
}