using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using ZeroLevel.Services.Serialization;

namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Bag-of-words model that assigns every learned word an integer index,
    /// counts how often each word was seen across the learned documents and
    /// converts a document into a sparse TF-IDF vector.
    /// </summary>
    public class BagOfWords : IBinarySerializable
    {
        // word -> [0]: index assigned to the word, [1]: number of times the word was
        // seen by Learn (equals the document frequency when every document holds
        // unique words, as Learn requires).
        private ConcurrentDictionary<string, int[]> _words;

        // Last index handed out; starts at -1 so the first learned word gets index 0.
        private int _words_count = -1;

        private long _number_of_documents = 0;

        /// <summary>Number of documents passed through <see cref="Learn"/>.</summary>
        public long NumberOfDocuments => _number_of_documents;

        /// <summary>Number of distinct words known to the model.</summary>
        public int NumberOfWords => _words.Count;

        public BagOfWords() => _words = new ConcurrentDictionary<string, int[]>();

        /// <summary>
        /// Adds a set of documents to the model. Documents are processed in parallel.
        /// </summary>
        /// <param name="documents">
        /// Documents to learn from. The words of each document must be
        /// lemmatized/stemmed and must be unique within that document.
        /// </param>
        public void Learn(string[][] documents)
        {
            Parallel.ForEach(documents, doc =>
            {
                Interlocked.Increment(ref _number_of_documents);
                foreach (var word in doc)
                {
                    if (_words.TryGetValue(word, out var known))
                    {
                        Interlocked.Increment(ref known[1]);
                        continue;
                    }

                    var fresh = new int[2] { Interlocked.Increment(ref _words_count), 1 };
                    if (!_words.TryAdd(word, fresh))
                    {
                        // Another thread registered the word between the lookup and
                        // the add: count this occurrence on the stored entry instead.
                        // The index reserved for 'fresh' is left unused in that case.
                        Interlocked.Increment(ref _words[word][1]);
                    }
                }
            });
        }

        /// <summary>
        /// Converts a document into a sparse TF-IDF vector over the learned vocabulary.
        /// </summary>
        /// <param name="doc">
        /// Document whose words must be lemmatized/stemmed the same way as the
        /// documents the model was trained on.
        /// </param>
        /// <returns>
        /// A sparse vector built from the non-zero TF-IDF weights and the indices of
        /// the corresponding words; an empty vector when <paramref name="doc"/> is
        /// null or empty. Words unknown to the model are ignored.
        /// </returns>
        public SparceVector Transform(string[] doc)
        {
            if (doc == null || doc.Length == 0)
            {
                return new SparceVector();
            }

            // Occurrences of every word inside this document (term-frequency numerator).
            var occurrences = new Dictionary<string, int>();
            foreach (var word in doc)
            {
                occurrences.TryGetValue(word, out var seen);
                occurrences[word] = seen + 1;
            }

            // Kept as double so the IDF ratio below is a floating-point division;
            // an integer division would truncate the ratio (e.g. 3 / 2 == 1 -> log == 0).
            double totalDocuments = _number_of_documents;

            var result = new Dictionary<int, double>();
            foreach (var word in doc)
            {
                if (!_words.TryGetValue(word, out var stats))
                {
                    continue; // word is not part of the learned vocabulary
                }

                var index = stats[0];
                var documentFrequency = stats[1];
                if (documentFrequency <= 0 || result.ContainsKey(index))
                {
                    // Skip entries without a usable count and words already handled
                    // by an earlier occurrence in this document.
                    continue;
                }

                var tf = (double)occurrences[word] / doc.Length;
                var idf = Math.Log(totalDocuments / documentFrequency);
                var tfidf = tf * idf;
                if (Math.Abs(tfidf) > double.Epsilon)
                {
                    result.Add(index, tfidf);
                }
            }

            return new SparceVector(result.Values.ToArray(), result.Keys.ToArray());
        }

        /// <summary>
        /// Restores the model state in the order written by <see cref="Serialize"/>:
        /// document count, last assigned word index, word table.
        /// </summary>
        public void Deserialize(IBinaryReader reader)
        {
            this._number_of_documents = reader.ReadLong();
            this._words_count = reader.ReadInt32();
            this._words = reader.ReadDictionaryAsConcurrent<string, int[]>();
        }

        /// <summary>
        /// Writes the model state: document count, last assigned word index, word table.
        /// </summary>
        public void Serialize(IBinaryWriter writer)
        {
            writer.WriteLong(this._number_of_documents);
            writer.WriteInt32(this._words_count);
            writer.WriteDictionary(this._words);
        }
    }
}