Fix sleopok engine.

master
Ogoun 2 weeks ago
parent d534fb3871
commit 61532ae1a0

@ -519,7 +519,7 @@ namespace PartitionFileStorageTest
EnableIndexInMemoryCachee = true
},
RootFolder = folder,
FilePartition = new StoreFilePartition<string, StoreMetadata>("Host hash", (key, _) => Math.Abs(StringHash.DotNetFullHash(key) % 367).ToString()),
FilePartition = new StoreFilePartition<string, StoreMetadata>("Host hash", (key, _) => Math.Abs(StringHash.DotNetFullHash(key.ToLowerInvariant()) % 367).ToString()),
MergeFunction = list =>
{
ulong s = 0;

@ -1,4 +1,7 @@
using ZeroLevel.Services.Semantic;
using Iveonik.Stemmers;
using ZeroLevel;
using ZeroLevel.Services.FileSystem;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Serialization;
using ZeroLevel.Sleopok.Engine;
using ZeroLevel.Sleopok.Engine.Models;
@ -9,73 +12,111 @@ namespace Sleopok.Tests
{
internal class Program
{
/// <summary>
/// Minimal book model with only title and author indexed.
/// Used by TestEngineReadWrite as the document type of the engine.
/// </summary>
public sealed class BookDocumentSimple
{
/// <summary>
/// Document identifier; the tests pass <c>b => b.Id</c> to the engine as the id selector.
/// </summary>
public string Id { get; set; }
// Indexed under the name "title" and flagged as available for exact match.
// NOTE(review): 10.0f is presumably the ranking boost of the field - confirm against SleoIndexAttribute.
[SleoIndex("title", 10.0f, avaliableForExactMatch: true)]
public string Title { get; set; }
// Indexed under the name "author" with the same attribute arguments as the title.
[SleoIndex("author", 10.0f, avaliableForExactMatch: true)]
public string Author { get; set; }
}
public sealed class BookDocument
{
public string Id { get; set; }
[SleoIndex("title", 200.0f)]
[SleoIndex("title", 10.0f, avaliableForExactMatch: true)]
public string Title { get; set; }
[SleoIndex("titlelm", 100.0f)]
public string TitleLemmas { get; set; }
[SleoIndex("stemms", 2.0f)]
public string Stemms { get; set; }
[SleoIndex("author", 10.0f)]
[SleoIndex("author", 10.0f, avaliableForExactMatch: true)]
public string Author { get; set; }
[SleoIndex("genre", 1.0f)]
public string Genre { get; set; }
}
private static Dictionary<string, string> _titles = new Dictionary<string, string>
{
{ "66056bc0481e83af64c55022", "Документ без названия" },
{ "6605698d481e83af64c45ad7", "На развилке дорог. Часть 2"},
{ "660581bc481e83af64cb8b4d", "Паниклав"},
{ "66057aa2481e83af64c9bb11", "Князь. Война магов (сборник)"},
{ "66057f75481e83af64cb04f7", "Антология севетского детектива-8. Компиляция. Книги 1-17"},
{ "66057bd4481e83af64ca0779", "Вор черной масти"},
{ "66057247481e83af64c76860", "Выбор"},
{ "66056807481e83af64c3a64f", "Последняя лекция"},
{ "66057f13481e83af64caed5d", "Оружие Круппа. История династии пушечных королей"},
{ "66057a37481e83af64c9a14b", "Месть Черного Дракона"},
{ "660588e8481e83af64cd2d3e", "Мгла над старыми могилами"},
{ "66056e88481e83af64c64e81", "Кровь и железо"},
{ "66057a8e481e83af64c9b673", "Маленькая страна"},
{ "6605687d481e83af64c3e360", "Санкт-Петербург история в преданиях и легендах"},
{ "66057987481e83af64c9770c", "Контракт на рабство"},
{ "66059052481e83af64cf5e31", "Агент космического сыска"},
{ "660580f9481e83af64cb61c9", "Две жизни Алессы Коэн"},
{ "66056807481e84af64c3a64f", "Последняя история"},
{ "66057f13481e85af64caed5d", "История Китая"},
{ "66057a37481e86af64c9a14b", "Время Черного Дракона"},
{ "660588e8481e87af64cd2d3e", "Страна которой нет"},
};
}
static async Task Main(string[] args)
{
//TestCompression();
// await FillOneFieldIndex();
// await TestSearch();
await TestEngine();
// await TestEngine();
await TestEngineReadWrite();
}
/// <summary>
/// Smoke test: writes two <see cref="BookDocumentSimple"/> documents into a freshly
/// cleaned storage folder and prints the ranked result of a two-token search.
/// </summary>
static async Task TestEngineReadWrite()
{
    // Start from an empty folder so results do not depend on previous runs.
    var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage");
    FSUtils.CleanAndTestFolder(tempFolder);

    var engine = new SleoEngine<BookDocumentSimple>(tempFolder, b => b.Id);
    using (var builder = engine.CreateBuilder())
    {
        await builder.Write(new[]
        {
            new BookDocumentSimple { Id = "01", Title = "Стихи Не Для Дам", Author = "Пушкин Александр Сергеевич" },
            new BookDocumentSimple { Id = "02", Title = "Светлинен стих", Author = "Азимов Айзък" },
        });
    }

    var reader = engine.CreateReader();
    var result = await reader.Search(new[] { "стихи", "пушкина" }, false);
    foreach (var pair in result)
    {
        Console.WriteLine($"[{pair.Key}]: {pair.Value}");
    }
}
static async Task TestEngine()
{
var engine = new SleoEngine<BookDocument>(@"H:\Test", b => b.Id);
ILexProvider lexProvider = new LexProvider(new RussianStemmer());
var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage");
FSUtils.CleanAndTestFolder(tempFolder);
var lex = new Func<string, string>(s => string.Join(" ", lexProvider.ExtractUniqueLexTokens(s).Select(s => s.Token)));
var engine = new SleoEngine<BookDocument>(tempFolder, b => b.Id);
using (var builder = engine.CreateBuilder())
{
builder.Write(new[]
await builder.Write(new[]
{
new BookDocument{ Id = "01", Title = "Страж птица", },
new BookDocument{ Id = "02" },
new BookDocument{ Id = "03" },
new BookDocument{ Id = "04" },
new BookDocument{ Id = "01", Title = "Юность Пушкина", Author = "Егорова Елена", Stemms = lex("Юность Пушкина") },
new BookDocument{ Id = "02", Title = "Детство Александра Пушкина", Author = "Егорова Елена Николаевна", Stemms = lex("Детство Александра Пушкина") },
new BookDocument{ Id = "03", Title = "Избранные стихи", Author = "Александра Пушкина", Stemms = lex("Избранные стихи") },
new BookDocument{ Id = "04", Title = "Анализ стихотворений Александра Сергеевича Пушкина", Author = "Ланцов Михаил", Stemms = lex("Анализ стихотворений Александра Сергеевича Пушкина") },
new BookDocument{ Id = "05", Title = "Море обаяния", Author = "Искандер Фазиль", Stemms = lex("Море обаяния") },
new BookDocument{ Id = "06", Title = "«Какаду»", Author = "Клысь Рышард", Stemms = lex("«Какаду»") },
new BookDocument{ Id = "07", Title = "Ряд случайных чисел [СИ]", Author = "Павлова Елена Евгеньевна", Stemms = lex("Ряд случайных чисел [СИ]") },
new BookDocument{ Id = "08", Title = "Последняя любовь. Плен и свобода", Author = "Мятная Витамина", Stemms = lex("Последняя любовь. Плен и свобода") },
new BookDocument{ Id = "09", Title = "Золотой ус. Лучшие рецепты исцеления", Author = "Альменов Чингиз", Stemms = lex("Золотой ус. Лучшие рецепты исцеления") },
new BookDocument{ Id = "10", Title = "Пушки смотрят на восток", Author = "Ефимова Марина Михайловна", Stemms = lex("Пушки смотрят на восто") },
new BookDocument{ Id = "11", Title = "Чингиз Хан, становление", Author = "Пушной Виталий", Stemms = lex("Чингиз Хан, становление") },
});
}
var reader = engine.CreateReader();
var result = await reader.Search(new[] { "Елена", "Евгеньевна" }, false);
foreach (var pair in result)
{
Console.WriteLine($"[{pair.Key}]: {pair.Value}");
}
}
static void TestCompression()
{
var one_zip = Compressor.Compress(new[] { "02" } );
var one_unzip = Compressor.DecompressToDocuments(one_zip);
var strings = new string[]
{
string.Empty,
@ -111,30 +152,41 @@ namespace Sleopok.Tests
}
}
static async Task FillOneFieldIndex()
static async Task TestSearch()
{
var store = new DataStorage(@"H:\TEST");
var tempFolder = Path.Combine(Configuration.BaseDirectory, "SleoTestStorage");
FSUtils.CleanAndTestFolder(tempFolder);
var store = new DataStorage(tempFolder);
using (var writer = store.GetWriter("author"))
{
await writer.Write("Козлов Игорь", "1");
await writer.Write("Ермакова Светлана Евгеньевна", "2");
await writer.Write("Муркок Майкл Лаумер Кейт Пик Мервин Ле Гуин Урсула Дилэни Сэмюэль Баллард Джеймс Грэм Эллисон Харлан Диксон Гордон Нивен Ларри Корнблат Сирил М Вульф Джин Лейбер Фриц Ройтер", "3");
await writer.Write("Коллектив Авторов", "4");
await writer.Write("Боннэр Елена Георгиевна", "5");
await writer.Write("Звёздкина Анна ", "6");
await writer.Complete();
}
using (var writer = store.GetWriter("title"))
{
foreach (var kv in _titles)
{
var tokens = WordTokenizer.Tokenize(kv.Value);
foreach (var t in tokens)
{
await writer.Write(t, kv.Key);
}
}
await writer.Write("Подкова на счастье", "1");
await writer.Write("Среднеазиатская овчарка", "2");
await writer.Write("Багряная игра. Сборник англо-американской фантастики", "3");
await writer.Write("Управление проектами. Фундаментальный курс", "4");
await writer.Write("Постскриптум: Книга о горьковской ссылке", "5");
await writer.Write("Фарватер", "6");
await writer.Complete();
}
}
static async Task TestSearch()
{
var store = new DataStorage(@"H:\TEST");
var docs = await store.GetDocuments("title", new string[] { "кровь", "страна", "железо", "история", "оружие" }, 1.0f, false);
var docs = await store.GetDocuments("title", new string[] { "Подкова на счастье" }, 1.0f, false);
foreach (var kv in docs.OrderByDescending(kv => kv.Value))
{
Console.WriteLine($"[{kv.Key}: {kv.Value}] {_titles[kv.Key]}");
Console.WriteLine($"[ID] = {kv.Key}: {kv.Value}");
}
}
}

@ -1,10 +1,8 @@
using System;
using System.Collections.Generic;
using System.Reflection;
using ZeroLevel;
using ZeroLevel.Services.FileSystem;
using ZeroLevel.Services.Reflection;
using ZeroLevel.Services.Extensions;
namespace ZeroLevel.Sleopok.Engine.Models
{
@ -34,6 +32,14 @@ namespace ZeroLevel.Sleopok.Engine.Models
var sleoAttribute = member.GetCustomAttribute<SleoIndexAttribute>();
if (sleoAttribute == null) continue;
// Classify the field by the type of the value it holds. MemberInfo.DeclaringType is the
// class that declares the member (the document type), so it must not be used here.
var valueType = (member as PropertyInfo)?.PropertyType ?? (member as FieldInfo)?.FieldType;
var type = SleoFieldType.Single;
// string is itself enumerable (a sequence of chars) but is indexed as one value.
if (valueType != null
    && valueType != typeof(string)
    && (TypeHelpers.IsGenericCollection(valueType)
        || TypeHelpers.IsArray(valueType)
        || TypeHelpers.IsEnumerable(valueType)))
{
    type = SleoFieldType.Array;
}
Func<object, object> getter;
switch (member.MemberType)
{
@ -48,6 +54,7 @@ namespace ZeroLevel.Sleopok.Engine.Models
var name = FSUtils.FileNameCorrection(string.IsNullOrWhiteSpace(sleoAttribute.Name) ? member.Name : sleoAttribute.Name);
_fields.Add(new SleoField
{
FieldType = type,
Boost = sleoAttribute.Boost,
Name = name,
Getter = getter,

@ -2,8 +2,20 @@
namespace ZeroLevel.Sleopok.Engine.Models
{
/// <summary>
/// Shape of the value held by an indexed document member.
/// </summary>
public enum SleoFieldType
{
/// <summary>
/// The member holds one value; this is the default assigned to every indexed member.
/// </summary>
Single = 0,
/// <summary>
/// The member holds several values (collection, array or enumerable).
/// </summary>
Array = 1,
}
internal sealed class SleoField
{
public SleoFieldType FieldType;
public string Name;
public float Boost;
public bool ExactMatch;

@ -1,10 +1,12 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
namespace ZeroLevel.Sleopok.Engine.Services.Indexes
{
public interface IIndexReader<T>
{
Task<Dictionary<string, float>> Search(string[] tokens, bool exactMatch);
Task<IOrderedEnumerable<KeyValuePair<string, float>>> Search(string[] tokens, bool exactMatch);
IAsyncEnumerable<FieldRecords> GetAll();
}
}

@ -1,5 +1,8 @@
using System.Collections.Generic;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using ZeroLevel.Implementation.Semantic.Helpers;
using ZeroLevel.Sleopok.Engine.Models;
using ZeroLevel.Sleopok.Engine.Services.Storage;
@ -8,6 +11,8 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes
internal sealed class IndexBuilder<T>
: IIndexBuilder<T>
{
private static char[] _separators = new char[] { ',', ' ', '.', '?', '!', '\\', '/', '+', '&' };
private readonly DataStorage _storage;
private readonly IndexInfo<T> _indexInfo;
private readonly Dictionary<string, IPartitionDataWriter> Indexers = new Dictionary<string, IPartitionDataWriter>();
@ -30,17 +35,33 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes
}
}
/// <summary>
/// Splits a field value into index tokens: the words reported by
/// <c>TextAnalizer.ExtractWords</c>, each converted to lower case (invariant culture).
/// </summary>
/// <param name="value">Raw field value; may be null, empty or white space.</param>
/// <returns>The lower-cased words, or an empty sequence for a blank value.</returns>
private static IEnumerable<string> Preprocess(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return Enumerable.Empty<string>();
    }

    return from word in TextAnalizer.ExtractWords(value)
           select word.ToLowerInvariant();
}
public async Task Write(IEnumerable<T> batch)
{
foreach (var doc in batch)
{
var doc_id = _indexInfo.GetId(doc);
foreach (var field in _indexInfo.Fields)
{
if (field.FieldType == SleoFieldType.Array)
{
// TO DO OPTIMIZATION
// Если поле уже хранит массив элементов, считать каждый элемент токеном
}
else
{
var value = field.Getter(doc!)?.ToString() ?? string.Empty;
if (string.IsNullOrWhiteSpace(value) == false)
{
foreach (var t in value.Split(' '))
foreach (var t in Preprocess(value))
{
await Indexers[field.Name].Write(t, doc_id);
}
@ -48,6 +69,7 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes
}
}
}
}
public void Dispose()
{

@ -1,10 +1,17 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using ZeroLevel.Sleopok.Engine.Models;
using ZeroLevel.Sleopok.Engine.Services.Storage;
namespace ZeroLevel.Sleopok.Engine.Services.Indexes
{
/// <summary>
/// Full content of one index field, as produced by the index reader's GetAll().
/// </summary>
public class FieldRecords
{
/// <summary>
/// Name of the index field.
/// </summary>
public string Field { get; set; }
/// <summary>
/// Stored key mapped to the list of document identifiers recorded for it.
/// NOTE(review): keys are presumably the indexed tokens - confirm against the writer.
/// </summary>
public Dictionary<string, List<string>> Records { get; set; }
}
internal sealed class IndexReader<T>
: IIndexReader<T>
{
@ -16,10 +23,9 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes
_indexInfo = indexInfo;
}
public async Task<Dictionary<string, float>> Search(string[] tokens, bool exactMatch)
public async Task<IOrderedEnumerable<KeyValuePair<string, float>>> Search(string[] tokens, bool exactMatch)
{
var documents = new Dictionary<string, float>();
foreach (var field in _indexInfo.Fields)
{
if (exactMatch && field.ExactMatch == false)
@ -40,7 +46,20 @@ namespace ZeroLevel.Sleopok.Engine.Services.Indexes
}
}
}
return documents;
return documents.OrderByDescending(d => d.Value);
}
/// <summary>
/// Streams the stored records of every index field, one <see cref="FieldRecords"/> per field.
/// </summary>
/// <returns>Asynchronous sequence in the order of the index field list.</returns>
public async IAsyncEnumerable<FieldRecords> GetAll()
{
    foreach (var indexField in _indexInfo.Fields)
    {
        // Load the field's records first, then wrap them together with the field name.
        var stored = await _storage.GetAllDocuments(indexField.Name);
        var item = new FieldRecords();
        item.Field = indexField.Name;
        item.Records = stored;
        yield return item;
    }
}
}
}

@ -10,7 +10,37 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
{
public sealed class DataStorage
{
private readonly IStore<string, string, byte[], StoreMetadata> _store;
#region Private
/// <summary>
/// Accumulates the rank of one document while the query tokens are processed in order.
/// Each matched token position adds to the score; hits at nearby positions add more.
/// </summary>
private class PositionDocScore
{
    private float score = 0.0f;
    private int _last_position = -1;
    private int count = 0;

    /// <summary>
    /// Final rank of the document.
    /// </summary>
    /// <param name="total">Number of tokens in the query.</param>
    /// <param name="exactMatch">When true, the document scores 1 only if every token matched, else 0.</param>
    /// <returns>The rank; for a non-exact search it is the accumulated score averaged over the query length and scaled by the number of matched tokens.</returns>
    public float GetScore(int total, bool exactMatch)
    {
        if (exactMatch)
        {
            return (count == total) ? 1.0f : 0f;
        }
        return (score / (float)total) * count;
    }

    /// <summary>
    /// Registers a hit of the query token at the given position.
    /// </summary>
    /// <param name="position">Zero-based index of the token within the query.</param>
    public void Increase(int position)
    {
        // The same position reported twice (the document listed more than once for one token)
        // must not be counted again: it would make diff == 0 (score becomes Infinity)
        // and inflate count, which breaks the exact-match comparison.
        if (position == _last_position)
        {
            return;
        }
        if (position == 0)
        {
            score = 1.0f;
        }
        else
        {
            // The closer this hit is to the previous one, the larger the bonus.
            var diff = position - _last_position;
            score += 1.0f + 1.0f / diff;
        }
        _last_position = position;
        count++;
    }
}
private class DateSourceWriter :
IPartitionDataWriter
@ -26,7 +56,6 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
_builder.CompleteAdding();
_builder.Compress();
await _builder.RebuildIndex();
_builder.Dispose();
}
public async Task Write(string host, string document)
@ -41,6 +70,9 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
_builder.Dispose();
}
}
#endregion
private readonly IStore<string, string, byte[], StoreMetadata> _store;
public DataStorage(string rootFolder)
{
@ -63,7 +95,7 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
EnableIndexInMemoryCachee = false
},
RootFolder = rootFolder,
FilePartition = new StoreFilePartition<string, StoreMetadata>("Token hash", (token, _) => Math.Abs(StringHash.DotNetFullHash(token) % 47).ToString()),
FilePartition = new StoreFilePartition<string, StoreMetadata>("Token hash", (token, _) => Math.Abs(StringHash.DotNetFullHash(token.ToLowerInvariant()) % 47).ToString()),
MergeFunction = list =>
{
return Compressor.Compress(list.OrderBy(c => c).ToArray());
@ -83,37 +115,14 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
return new DateSourceWriter(_store.CreateBuilder(new StoreMetadata { Field = field }));
}
private class PositionDocScore
{
private float score = 0.0f;
private int _last_position = -1;
private int count = 0;
public float GetScore(int total, bool exactMatch)
{
if (exactMatch)
{
return (count == total) ? 1.0f : 0f;
}
return (score / (float)total) * count;
}
public void Increase(int position)
{
if (position == 0)
{
score = 1.0f;
}
else
{
var diff = position - _last_position;
score += 1.0f / diff;
}
_last_position = position;
count++;
}
}
/// <summary>
/// Поиск документов.
/// </summary>
/// <param name="field">Поле по которому производится поиск.</param>
/// <param name="tokens">Поисковый запрос.</param>
/// <param name="boost">Множитель ранга для результата поиска.</param>
/// <param name="exactMatch">true - если искать только точные совпадения.</param>
/// <returns>Список идентификаторов документов с коэффициентом ранжирования.</returns>
public async Task<Dictionary<string, float>> GetDocuments(string field, string[] tokens, float boost, bool exactMatch)
{
var documents = new Dictionary<string, PositionDocScore>();
@ -125,7 +134,7 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
int step = 0;
foreach (var token in tokens)
{
var sr = await accessor.Find(token);
var sr = await accessor.Find(token.ToLowerInvariant());
if (sr.Success)
{
foreach (var doc in Compressor.DecompressToDocuments(sr.Value))
@ -137,12 +146,39 @@ namespace ZeroLevel.Sleopok.Engine.Services.Storage
documents[doc].Increase(step);
}
}
step++;
}
}
}
return documents.ToDictionary(d => d.Key, d => boost * d.Value.GetScore(tokens.Length, exactMatch));
}
/// <summary>
/// Reads every record stored for a field.
/// </summary>
/// <param name="field">Name of the index field.</param>
/// <returns>
/// Stored key mapped to the document identifiers decompressed from its value;
/// empty when no accessor could be created for the field.
/// </returns>
public async Task<Dictionary<string, List<string>>> GetAllDocuments(string field)
{
    var result = new Dictionary<string, List<string>>();
    var accessor = _store.CreateAccessor(new StoreMetadata { Field = field });
    if (accessor == null)
    {
        return result;
    }

    using (accessor)
    {
        await foreach (var entry in accessor.Iterate())
        {
            entry.Deconstruct(out string token, out byte[] payload);
            var ids = Compressor.DecompressToDocuments(payload);
            if (!result.TryGetValue(token, out var bucket))
            {
                // First occurrence of the key: start a new list from its ids.
                result[token] = new List<string>(ids);
            }
            else
            {
                // The key was already seen: append.
                bucket.AddRange(ids);
            }
        }
    }
    return result;
}
public async Task Dump(string key, Stream stream)
{
using (TextWriter writer = new StreamWriter(stream))

@ -302,5 +302,22 @@ namespace ZeroLevel.Services.HashFunctions
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ulong GetUInt64(this byte[] bb, int pos) =>
(ulong)(bb[pos++] | bb[pos++] << 8 | bb[pos++] << 16 | bb[pos++] << 24);
/// <summary>
/// Mixes the bits of a 32-bit value with the MurmurHash3 32-bit finalizer steps
/// (xor-shift by 16, multiply, xor-shift by 13, multiply, xor-shift by 16).
/// </summary>
/// <param name="h">Value to mix.</param>
/// <returns>The mixed value reinterpreted as a signed integer.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int Compute(int h)
{
    uint mixed = (uint)h;
    mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
    mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;
    mixed = mixed ^ (mixed >> 16);
    return (int)mixed;
}
}
}

@ -15,7 +15,7 @@ namespace ZeroLevel.Logging
{
get
{
if (_messageQueue.IsCompleted)
if (_isDisposed || _messageQueue.IsCompleted)
return 0;
return _messageQueue.Count;
}

@ -19,7 +19,8 @@ namespace ZeroLevel.Services.PartitionStorage
private readonly Func<MemoryStreamReader, Task<DeserializeResult<TInput>>> _inputDeserializer;
private readonly Func<MemoryStreamReader, Task<DeserializeResult<TValue>>> _valueDeserializer;
public StoreSerializers(Func<MemoryStreamWriter, TKey, Task> keySerializer,
public StoreSerializers(
Func<MemoryStreamWriter, TKey, Task> keySerializer,
Func<MemoryStreamWriter, TInput, Task> inputSerializer,
Func<MemoryStreamWriter, TValue, Task> valueSerializer,
Func<MemoryStreamReader, Task<DeserializeResult<TKey>>> keyDeserializer,

@ -19,24 +19,18 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
/// <returns>Words</returns>
public static IEnumerable<string> ExtractWords(string text)
{
var result = new List<string>();
foreach (Match match in ReWord.Matches(text))
{
result.Add(match.Value);
yield return match.Value;
}
return result;
}
public static IEnumerable<string> ExtractRuWords(string text)
{
var result = new List<string>();
foreach (Match match in ReRuWord.Matches(text))
{
result.Add(match.Value);
yield return match.Value;
}
return result;
}
/// <summary>
@ -66,23 +60,18 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
/// <returns>Tokens</returns>
public static IEnumerable<WordToken> ExtractWordTokens(string text)
{
var result = new List<WordToken>();
foreach (Match match in ReWord.Matches(text))
{
result.Add(new WordToken(match.Value, match.Index));
yield return new WordToken(match.Value, match.Index);
}
return result;
}
public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
{
var result = new List<WordToken>();
for (int i = 0; i < words.Length; i++)
{
result.Add(new WordToken(words[i], i));
yield return new WordToken(words[i], i);
}
return result;
}
/// <summary>

@ -0,0 +1,37 @@
using System;
namespace ZeroLevel.Services.Semantic.Model
{
/// <summary>
/// Node of a character trie: a key character plus an array of child nodes.
/// The child array is replaced by a new array on every structural change.
/// </summary>
internal class CharTrieNode(char key)
{
    /// <summary>Character stored in this node.</summary>
    public char Key { get; } = key;

    /// <summary>False for a plain node; overridable by subclasses.</summary>
    public virtual bool IsTerminal
    {
        get { return false; }
    }

    /// <summary>Child nodes; starts empty.</summary>
    public CharTrieNode[] Children { get; set; } = [];

    /// <summary>Appends <paramref name="node"/> as the last child.</summary>
    public void AddChild(CharTrieNode node)
    {
        var oldCount = Children.Length;
        var grown = new CharTrieNode[oldCount + 1];
        Children.CopyTo(grown, 0);
        grown[oldCount] = node;
        Children = grown;
    }

    /// <summary>
    /// Removes the child at <paramref name="index"/> by moving the last child into its slot
    /// and dropping the last slot; the order of the remaining children is not preserved.
    /// </summary>
    public void RemoveChildAt(int index)
    {
        var lastIndex = Children.Length - 1;
        var shrunk = new CharTrieNode[lastIndex];
        Children[index] = Children[lastIndex];
        Array.Copy(Children, shrunk, lastIndex);
        Children = shrunk;
    }

    /// <summary>Replaces the children with a copy of <paramref name="toCopy"/>.</summary>
    public void CopyChildren(CharTrieNode[] toCopy)
    {
        var snapshot = new CharTrieNode[toCopy.Length];
        toCopy.CopyTo(snapshot, 0);
        Children = snapshot;
    }

    public override string ToString()
    {
        return "Key: " + Key;
    }
}
}

@ -0,0 +1,9 @@
namespace ZeroLevel.Services.Semantic.Model
{
/// <summary>
/// Pattern element for trie pattern search: either a concrete character or <see cref="Any"/>.
/// </summary>
public readonly record struct Character(char Char)
{
    /// <summary>
    /// The default value of the struct (its <c>Char</c> is '\0'); the trie's pattern
    /// search treats an element equal to it as "any character".
    /// </summary>
    public static Character Any { get; } = default;

    /// <summary>Wraps a plain character.</summary>
    public static implicit operator Character(char c)
    {
        return new Character(c);
    }
}
}

@ -0,0 +1,11 @@
namespace ZeroLevel.Services.Semantic.Model
{
/// <summary>
/// Trie node that ends a stored word and carries that word.
/// </summary>
internal class TerminalCharTrieNode(char key) : CharTrieNode(key)
{
    /// <summary>Always true for this node type.</summary>
    public override bool IsTerminal
    {
        get { return true; }
    }

    /// <summary>The word that ends at this node; set by the creator of the node.</summary>
    public string Word { get; init; } = null!;

    public override string ToString()
    {
        return "Key: " + Key + ", Word: " + Word;
    }
}
}

@ -0,0 +1,429 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using ZeroLevel.Services.Semantic.Model;
///
/// https://github.com/kpol/trie
///
namespace ZeroLevel.Services.Semantic.Search
{
/// <summary>
/// Set of words stored as a character trie. Supports exact lookup, prefix search,
/// pattern search with a wildcard element (<see cref="Character.Any"/>) and removal.
/// Characters are compared with the comparer supplied to the constructor.
/// </summary>
public sealed class PrefixTrie
    : ICollection<string>, IReadOnlyCollection<string>
{
    private const string BlankValueMessage = "Value cannot be null, empty or consist only of white-space characters.";

    private readonly IEqualityComparer<char> _comparer;
    private readonly CharTrieNode _root = new(char.MinValue);

    /// <summary>
    /// Creates an empty trie.
    /// </summary>
    /// <param name="comparer">Character comparer; the default equality comparer is used when null.</param>
    public PrefixTrie(IEqualityComparer<char>? comparer = null)
    {
        _comparer = comparer ?? EqualityComparer<char>.Default;
    }

    /// <summary>Number of words currently stored.</summary>
    public int Count { get; private set; }

    bool ICollection<string>.IsReadOnly => false;

    /// <summary>
    /// Adds a word.
    /// </summary>
    /// <param name="word">Word to add.</param>
    /// <returns>True when the word was added; false when it was already present.</returns>
    /// <exception cref="ArgumentException">The word is null, empty or white space.</exception>
    public bool Add(string word)
    {
        ThrowIfBlank(word, nameof(word));

        var (existingTerminalNode, parent) = AddNodesFromUpToBottom(word);
        if (existingTerminalNode is not null && existingTerminalNode.IsTerminal)
        {
            return false; // already exists
        }

        var newTerminalNode = new TerminalCharTrieNode(word[^1]) { Word = word };
        AddTerminalNode(parent, existingTerminalNode, newTerminalNode, word);
        return true;
    }

    /// <summary>Removes all words.</summary>
    public void Clear()
    {
        _root.Children = [];
        Count = 0;
    }

    /// <summary>Checks whether the exact word is stored.</summary>
    /// <exception cref="ArgumentException">The word is null or empty.</exception>
    public bool Contains(string word) => Contains(word.AsSpan());

    /// <summary>
    /// Length of the longest leading part of <paramref name="word"/> that exists as a path in the trie.
    /// </summary>
    public int IntersectionWith(string word) => IntersectionWith(word.AsSpan());

    /// <summary>
    /// Length of the longest leading part of <paramref name="word"/> that exists as a path in the trie;
    /// 0 for an empty span.
    /// </summary>
    public int IntersectionWith(ReadOnlySpan<char> word)
    {
        if (word.IsEmpty)
        {
            return 0;
        }
        return CalculateIntersection(word);
    }

    /// <summary>Checks whether the exact word is stored.</summary>
    /// <exception cref="ArgumentException">The span is empty.</exception>
    public bool Contains(ReadOnlySpan<char> word)
    {
        // Only an empty span is rejected here; a white-space-only span is simply looked up.
        if (word.IsEmpty)
        {
            throw new ArgumentException(BlankValueMessage, nameof(word));
        }

        var node = GetNode(word);
        return node is not null && node.IsTerminal;
    }

    /// <summary>
    /// Removes a word.
    /// </summary>
    /// <returns>True when the word was stored and has been removed; otherwise false.</returns>
    /// <exception cref="ArgumentException">The word is null, empty or white space.</exception>
    public bool Remove(string word)
    {
        ThrowIfBlank(word, nameof(word));

        var nodesUpToBottom = GetNodesForRemoval(word);
        if (nodesUpToBottom.Count == 0)
        {
            return false;
        }

        RemoveNode(nodesUpToBottom);
        return true;
    }

    /// <summary>
    /// Returns the stored words that start with <paramref name="value"/> (including the value itself when stored).
    /// </summary>
    /// <exception cref="ArgumentException">The value is null, empty or white space.</exception>
    public IEnumerable<string> StartsWith(string value)
    {
        ThrowIfBlank(value, nameof(value));
        return _();

        IEnumerable<string> _() => GetTerminalNodesByPrefix(value).Select(n => n.Word);
    }

    /// <summary>
    /// Returns the stored words that have exactly the pattern's length and match it element by element;
    /// <see cref="Character.Any"/> matches any character.
    /// </summary>
    /// <exception cref="ArgumentNullException">The pattern is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The pattern is empty.</exception>
    public IEnumerable<string> Matches(IReadOnlyList<Character> pattern)
    {
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));
        if (pattern.Count == 0) throw new ArgumentOutOfRangeException(nameof(pattern));
        return _();

        IEnumerable<string> _() =>
            GetNodesByPattern(pattern)
                .Where(n => n.IsTerminal)
                .Cast<TerminalCharTrieNode>()
                .Select(n => n.Word);
    }

    /// <summary>
    /// Returns the stored words whose beginning matches the pattern;
    /// <see cref="Character.Any"/> matches any character.
    /// </summary>
    /// <exception cref="ArgumentNullException">The pattern is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The pattern is empty.</exception>
    public IEnumerable<string> StartsWith(IReadOnlyList<Character> pattern)
    {
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));
        if (pattern.Count == 0) throw new ArgumentOutOfRangeException(nameof(pattern));
        return _();

        IEnumerable<string> _()
        {
            foreach (var n in GetNodesByPattern(pattern))
            {
                if (n.IsTerminal)
                {
                    yield return ((TerminalCharTrieNode)n).Word;
                }
                foreach (var terminalNode in GetDescendantTerminalNodes(n))
                {
                    yield return terminalNode.Word;
                }
            }
        }
    }

    /// <summary>
    /// Walks the word from the root, creating the missing nodes for every character except the last one.
    /// </summary>
    /// <returns>The existing node for the last character (if any) and the parent it belongs under.</returns>
    internal (CharTrieNode? existingTerminalNode, CharTrieNode parent) AddNodesFromUpToBottom(ReadOnlySpan<char> word)
    {
        var current = _root;
        for (int i = 0; i < word.Length - 1; i++)
        {
            var n = GetChildNode(current, word[i]);
            if (n is not null)
            {
                current = n;
            }
            else
            {
                CharTrieNode node = new(word[i]);
                AddToNode(current, node);
                current = node;
            }
        }

        var terminalNode = GetChildNode(current, word[^1]);
        return (terminalNode, current);
    }

    /// <summary>
    /// Attaches the terminal node for a word to its parent. An existing non-terminal node for the
    /// last character is replaced, keeping its children.
    /// </summary>
    internal void AddTerminalNode(CharTrieNode parent, CharTrieNode? existingNode, CharTrieNode newTerminalNode, string word)
    {
        if (existingNode is not null)
        {
            newTerminalNode.CopyChildren(existingNode.Children);
            RemoveChildFromNode(parent, word[^1]);
        }

        AddToNode(parent, newTerminalNode);
        Count++;
    }

    /// <summary>Terminal nodes at or below the node reached by the prefix; empty when the prefix is absent.</summary>
    internal IEnumerable<TerminalCharTrieNode> GetTerminalNodesByPrefix(ReadOnlySpan<char> prefix)
    {
        var node = GetNode(prefix);
        return GetTerminalNodes(node);
    }

    // Yields the node itself when it is terminal, followed by its terminal descendants.
    private IEnumerable<TerminalCharTrieNode> GetTerminalNodes(CharTrieNode? node)
    {
        if (node is null)
        {
            yield break;
        }

        if (node.IsTerminal)
        {
            yield return (TerminalCharTrieNode)node;
        }

        foreach (var n in GetDescendantTerminalNodes(node))
        {
            yield return n;
        }
    }

    /// <summary>Enumerates all stored words.</summary>
    public IEnumerator<string> GetEnumerator() => GetAllTerminalNodes().Select(n => n.Word).GetEnumerator();

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    // Add(word) validates the argument itself and reports the same parameter name.
    void ICollection<string>.Add(string word) => Add(word);

    void ICollection<string>.CopyTo(string[] array, int arrayIndex)
    {
        if (array == null) throw new ArgumentNullException(nameof(array));
        if (arrayIndex < 0) throw new ArgumentOutOfRangeException(nameof(arrayIndex));
        if (Count > array.Length - arrayIndex)
        {
            throw new ArgumentException(
                "The number of elements in the trie is greater than the available space from index to the end of the destination array.");
        }

        foreach (var node in GetAllTerminalNodes())
        {
            array[arrayIndex++] = node.Word;
        }
    }

    /// <summary>All terminal nodes of the trie.</summary>
    internal IEnumerable<TerminalCharTrieNode> GetAllTerminalNodes() => GetDescendantTerminalNodes(_root);

    /// <summary>
    /// Terminal nodes strictly below <paramref name="node"/>, visited level by level using a queue.
    /// </summary>
    internal static IEnumerable<TerminalCharTrieNode> GetDescendantTerminalNodes(CharTrieNode node)
    {
        Queue<CharTrieNode> queue = new(node.Children);
        while (queue.Count > 0)
        {
            var n = queue.Dequeue();
            if (n.IsTerminal)
            {
                yield return (TerminalCharTrieNode)n;
            }

            for (var i = 0; i < n.Children.Length; i++)
            {
                queue.Enqueue(n.Children[i]);
            }
        }
    }

    /// <summary>Number of leading characters of <paramref name="prefix"/> that can be followed from the root.</summary>
    internal int CalculateIntersection(ReadOnlySpan<char> prefix)
    {
        var current = _root;
        for (var i = 0; i < prefix.Length; i++)
        {
            current = GetChildNode(current, prefix[i]);
            if (current is null)
            {
                return i;
            }
        }
        return prefix.Length;
    }

    /// <summary>Node reached by following <paramref name="prefix"/> from the root, or null when the path is absent.</summary>
    internal CharTrieNode? GetNode(ReadOnlySpan<char> prefix)
    {
        var current = _root;
        for (var i = 0; i < prefix.Length; i++)
        {
            current = GetChildNode(current, prefix[i]);
            if (current is null)
            {
                return null;
            }
        }
        return current;
    }

    /// <summary>
    /// Nodes at depth <c>pattern.Count</c> whose path from the root matches the pattern;
    /// an element equal to <see cref="Character.Any"/> follows every child.
    /// </summary>
    internal IEnumerable<CharTrieNode> GetNodesByPattern(IReadOnlyList<Character> pattern)
    {
        Queue<(CharTrieNode node, int index)> queue = [];
        queue.Enqueue((_root, 0));

        while (queue.Count > 0)
        {
            var (node, index) = queue.Dequeue();
            var isLastElement = index == pattern.Count - 1;

            if (pattern[index] != Character.Any)
            {
                var n = GetChildNode(node, pattern[index].Char);
                if (n is null)
                {
                    continue;
                }

                if (isLastElement)
                {
                    yield return n;
                }
                else
                {
                    queue.Enqueue((n, index + 1));
                }
            }
            else
            {
                for (var i = 0; i < node.Children.Length; i++)
                {
                    if (isLastElement)
                    {
                        yield return node.Children[i];
                    }
                    else
                    {
                        queue.Enqueue((node.Children[i], index + 1));
                    }
                }
            }
        }
    }

    // Collects the path root -> ... -> terminal node of the word (terminal on top of the stack);
    // returns an empty stack when the word is not stored.
    private Stack<CharTrieNode> GetNodesForRemoval(string prefix)
    {
        var current = _root;
        Stack<CharTrieNode> nodesUpToBottom = [];
        nodesUpToBottom.Push(_root);

        for (var i = 0; i < prefix.Length; i++)
        {
            var c = prefix[i];
            current = GetChildNode(current, c);
            if (current is not null)
            {
                nodesUpToBottom.Push(current);
            }
            else
            {
                return [];
            }
        }

        return current.IsTerminal ? nodesUpToBottom : [];
    }

    // Removes the word whose path is on the stack: a leaf is pruned together with its
    // childless non-terminal ancestors; a node with children is replaced by a non-terminal copy.
    private void RemoveNode(Stack<CharTrieNode> nodesUpToBottom)
    {
        Count--;

        var node = nodesUpToBottom.Pop();
        if (node.Children.Length == 0)
        {
            while (node.Children.Length == 0 && nodesUpToBottom.Count > 0)
            {
                var parent = nodesUpToBottom.Pop();
                RemoveChildFromNode(parent, node.Key);
                // A terminal ancestor is another stored word and must stay.
                if (parent.IsTerminal) return;
                node = parent;
            }
        }
        else
        {
            // convert node to non-terminal node
            CharTrieNode n = new(node.Key);
            n.CopyChildren(node.Children);
            var parent = nodesUpToBottom.Count == 0 ? _root : nodesUpToBottom.Pop();
            RemoveChildFromNode(parent, node.Key);
            AddToNode(parent, n);
        }
    }

    // Adds the child unless a child with an equal key (per the comparer) already exists.
    private void AddToNode(CharTrieNode node, CharTrieNode nodeToAdd)
    {
        for (var i = 0; i < node.Children.Length; i++)
        {
            if (_comparer.Equals(nodeToAdd.Key, node.Children[i].Key))
            {
                return;
            }
        }
        node.AddChild(nodeToAdd);
    }

    // Removes the first child whose key equals the given key (per the comparer), if any.
    private void RemoveChildFromNode(CharTrieNode node, char key)
    {
        for (int i = 0; i < node.Children.Length; i++)
        {
            if (_comparer.Equals(key, node.Children[i].Key))
            {
                node.RemoveChildAt(i);
                break;
            }
        }
    }

    // First child whose key equals the given key (per the comparer), or null.
    private CharTrieNode? GetChildNode(CharTrieNode node, char key)
    {
        for (var i = 0; i < node.Children.Length; i++)
        {
            var n = node.Children[i];
            if (_comparer.Equals(key, n.Key))
            {
                return n;
            }
        }
        return null;
    }

    // ArgumentException takes (message, paramName); passing only nameof(x) would make the
    // parameter name the message and leave ParamName empty.
    private static void ThrowIfBlank(string value, string paramName)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            throw new ArgumentException(BlankValueMessage, paramName);
        }
    }
}
}

@ -0,0 +1,72 @@
using System;
namespace ZeroLevel.Services.Utils
{
/// <summary>
/// Helpers for converting byte sizes between numbers and human-readable strings.
/// </summary>
public static class Utility
{
    /// <summary>
    /// Parse size in string notation into long.
    /// Examples: 4k, 4K, 4KB, 4 KB, 8m, 8MB, 12g, 12 GB, 16t, 16 TB, 32p, 32 PB.
    /// ASCII digits are accumulated as a decimal number; the first unit letter (k, m, g, t, p,
    /// any case) multiplies it by the matching power of 1024 and ends parsing.
    /// Every other character is ignored.
    /// </summary>
    /// <param name="value">String version of number</param>
    /// <returns>The number; 0 when the string contains no digits.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
    public static long ParseSize(string value)
    {
        if (value is null)
        {
            throw new ArgumentNullException(nameof(value));
        }

        const string suffixes = "kmgtp";
        long result = 0;
        foreach (char c in value)
        {
            // Only ASCII digits: char.IsDigit also accepts other Unicode digits,
            // for which "code - '0'" is not the digit's value.
            if (c is >= '0' and <= '9')
            {
                result = result * 10 + (c - '0');
                continue;
            }

            int unit = suffixes.IndexOf(char.ToLowerInvariant(c));
            if (unit >= 0)
            {
                // k = 1024^1, m = 1024^2, ... p = 1024^5
                return result * (1L << (10 * (unit + 1)));
            }
        }
        return result;
    }

    /// <summary>
    /// Pretty print value: divides by 1024 until the integer part has at most three characters
    /// (or the largest unit is reached) and appends the unit, e.g. 512B, 1KB, 1MB.
    /// </summary>
    /// <param name="value">Size in bytes.</param>
    /// <returns>The scaled number followed by the unit; the number is formatted with the current culture.</returns>
    internal static string PrettySize(long value)
    {
        // 'E' covers values of about 1000 PiB and above, which need six divisions;
        // without it the suffix lookup would run past the end of the array.
        char[] suffix = ['K', 'M', 'G', 'T', 'P', 'E'];
        double v = value;
        int unit = 0;

        // A long converted to double has no fractional part, so only scaling down is needed.
        while (unit < suffix.Length && Math.Floor(v).ToString().Length > 3)
        {
            unit++;
            v /= 1024;
            v = Math.Round(v, 12);
        }

        if (unit == 0)
        {
            return v.ToString() + "B";
        }
        return v.ToString() + suffix[unit - 1] + "B";
    }
}
}
Loading…
Cancel
Save

Powered by TurnKey Linux.