diff --git a/TestApp/Program.cs b/TestApp/Program.cs index 4f0c0ed..98085ac 100644 --- a/TestApp/Program.cs +++ b/TestApp/Program.cs @@ -1,4 +1,5 @@ using System; +using System.Net; using ZeroLevel; namespace TestApp @@ -18,4 +19,4 @@ namespace TestApp Bootstrap.Shutdown(); } } -} +} \ No newline at end of file diff --git a/ZeroLevel.UnitTests/SemanticTests.cs b/ZeroLevel.UnitTests/SemanticTests.cs new file mode 100644 index 0000000..ba16b1d --- /dev/null +++ b/ZeroLevel.UnitTests/SemanticTests.cs @@ -0,0 +1,34 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Xunit; +using ZeroLevel.Services.Semantic; + +namespace ZeroLevel.UnitTests +{ + public class SemanticTests + { + [Fact] + public void WordTokenizerTest() + { + // Arrange + var line = "Хорошее понимание проекций, отражений и векторных операций (как в истинном значении скалярного (dot) и векторного (cross) произведений векторов) обычно приходит с растущим чувством беспокойства при использованием тригонометрии. "; + var test = new string[] { + "хорошее", "понимание", "проекций", "отражений", "и" + , "векторных", "операций", "как", "в", "истинном" + , "значении", "скалярного","dot","и","векторного","cross","произведений" + ,"векторов","обычно","приходит","с","растущим","чувством","беспокойства" + ,"при","использованием", "тригонометрии"}; + // Act + var terms = WordTokenizer.Tokenize(line).ToArray(); + // Assert + + Assert.True(test.Length == terms.Length); + for (int i = 0; i < terms.Length; i++) + { + Assert.True(string.CompareOrdinal(test[i], terms[i]) == 0); + } + } + } +} diff --git a/ZeroLevel/Services/Semantic/LanguageDictionary.cs b/ZeroLevel/Services/Semantic/LanguageDictionary.cs new file mode 100644 index 0000000..f44f5fd --- /dev/null +++ b/ZeroLevel/Services/Semantic/LanguageDictionary.cs @@ -0,0 +1,36 @@ +using System.Collections.Generic; +using ZeroLevel.Services.Semantic; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.Services.Semantic +{ + public class LanguageDictionary + : IBinarySerializable + { + private Trie _words = new Trie(); + public uint this[string word] => _words.Key(word) ?? 0; + public string Word(uint key) => _words.Word(key); + + public IEnumerable Keys => _words.Keys; + + public void Append(string word) + { + _words.Append(word.Normalize()); + } + + public void Deserialize(IBinaryReader reader) + { + this._words = reader.Read(); + } + + public void Serialize(IBinaryWriter writer) + { + writer.Write(this._words); + } + + public void ToggleReverseIndex(bool enabled) + { + _words.ToggleReverseIndex(enabled); + } + } +} diff --git a/ZeroLevel/Services/Semantic/Trie.cs b/ZeroLevel/Services/Semantic/Trie.cs index 472c494..31effa7 100644 --- a/ZeroLevel/Services/Semantic/Trie.cs +++ b/ZeroLevel/Services/Semantic/Trie.cs @@ -12,7 +12,7 @@ namespace ZeroLevel.Services.Semantic internal class TrieNode : IBinarySerializable { - public char? Key; // settet only with rebuild index + public char? Key; // setted only with rebuild index public uint? Value; public TrieNode Parent; public ConcurrentDictionary Children; diff --git a/ZeroLevel/Services/Semantic/WordTokenizer.cs b/ZeroLevel/Services/Semantic/WordTokenizer.cs new file mode 100644 index 0000000..17490d2 --- /dev/null +++ b/ZeroLevel/Services/Semantic/WordTokenizer.cs @@ -0,0 +1,47 @@ +using System; +using System.Collections.Generic; +using ZeroLevel.Services.Pools; + +namespace ZeroLevel.Services.Semantic +{ + public static class WordTokenizer + { + static ObjectPool _pool = new ObjectPool(() => new char[2048]); + + public static IEnumerable Tokenize(string text) + { + int index = 0; + bool first = true; + var buffer = _pool.Allocate(); + try + { + for (int i = 0; i < text.Length; i++) + { + if (first && Char.IsLetter(text[i])) + { + first = false; + buffer[index++] = text[i]; + } + else if (first == false && Char.IsLetterOrDigit(text[i])) + { + buffer[index++] = text[i]; + } + else if (index > 0) + { + yield return new string(buffer, 0, index).ToLowerInvariant(); + index = 0; + first = true; + } + } + if (index > 0) + { + yield return new string(buffer, 0, index).ToLowerInvariant(); + } + } + finally + { + _pool.Free(buffer); + } + } + } +} diff --git a/ZeroLevel/Services/Semantic/WordsDictionary.cs b/ZeroLevel/Services/Semantic/WordsDictionary.cs new file mode 100644 index 0000000..5c64e25 --- /dev/null +++ b/ZeroLevel/Services/Semantic/WordsDictionary.cs @@ -0,0 +1,64 @@ +using System.Collections.Concurrent; +using System.Threading; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.Services.Semantic +{ + public class WordsDictionary + : IBinarySerializable + { + private ConcurrentDictionary _dicts = new ConcurrentDictionary(); + private ReaderWriterLockSlim _lock = new ReaderWriterLockSlim(); + + public LanguageDictionary this[string lang] + { + get + { + if (_dicts.ContainsKey(lang) == false) + { + _lock.EnterWriteLock(); + try + { + if (_dicts.ContainsKey(lang) == false) + { + _dicts[lang] = new LanguageDictionary(); + } + } + finally + { + _lock.ExitWriteLock(); + } + } + return _dicts[lang]; + } + } + public void ToggleReverseIndex(bool enabled) + { + foreach (var pair in _dicts) + { + pair.Value.ToggleReverseIndex(enabled); + } + } + public void Deserialize(IBinaryReader reader) + { + int count = reader.ReadInt32(); + this._dicts = new ConcurrentDictionary(); + string key; + for (int i = 0; i < count; i++) + { + key = reader.ReadString(); + this._dicts.TryAdd(key, reader.Read()); + } + } + + public void Serialize(IBinaryWriter writer) + { + writer.WriteInt32(_dicts.Count); + foreach (var pair in _dicts) + { + writer.WriteString(pair.Key); + writer.Write(pair.Value); + } + } + } +}