using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using ZeroLevel.DataStructures;
using ZeroLevel.Implementation.Semantic.Helpers;
using ZeroLevel.Services.Serialization;

namespace ZeroLevel.Services.Semantic.Helpers
{
    public class BagOfTerms
    {
        private string[] _words;
        private ILexProvider _lexer;

        public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), null!) { }

        public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { }

        public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), null!) { }

        public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { }

        public BagOfTerms(string[] words) : this(words, null!) { }

        public BagOfTerms(string[] words, ILexProvider lexer)
        {
            _lexer = lexer;
            _frequency = null!;
            // When a lexer is supplied, normalize the words into lexical tokens;
            // otherwise keep the raw words as-is.
            _words = _lexer == null ? words : _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray();
        }

        public string[] Words => _words;

        private IDictionary<string, int> _frequency;

        /// <summary>
        /// Term frequencies (word -> occurrence count), computed lazily on first call.
        /// </summary>
        public IDictionary<string, int> Frequency()
        {
            if (_frequency == null!)
            {
                var frequency = new Dictionary<string, int>();
                for (int i = 0; i < _words.Length; i++)
                {
                    if (frequency.ContainsKey(_words[i]))
                    {
                        frequency[_words[i]]++;
                    }
                    else
                    {
                        frequency[_words[i]] = 1;
                    }
                }
                _frequency = frequency;
            }
            return _frequency;
        }

        public string[] ToTokens()
        {
            return _words;
        }

        public string[] ToUniqueTokens()
        {
            return _words.Distinct()
                .ToArray();
        }

        public string[] ToUniqueTokensWithoutStopWords()
        {
            return _words.Where(w => StopWords.IsStopWord(w) == false)
                .Distinct()
                .ToArray();
        }
    }
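
    // Usage sketch (illustrative, not part of the library): build a bag of terms
    // from raw text and inspect it. A lexer-backed overload exists for
    // lemmatized/stemmed tokens; passing no lexer keeps the raw words.
    //
    //   var bag = new BagOfTerms("the quick brown fox jumps over the lazy dog");
    //   string[] unique = bag.ToUniqueTokensWithoutStopWords();
    //   foreach (var pair in bag.Frequency())
    //   {
    //       Console.WriteLine($"{pair.Key}: {pair.Value}");
    //   }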

    public class BagOfWords1 :
        IBinarySerializable
    {
        // word -> [unique word index, number of documents containing the word];
        // _words_count starts at -1 so the first index handed out is 0.
        private ConcurrentDictionary<string, int[]> _words;
        int _words_count = -1;
        long _number_of_documents = 0;

        public long NumberOfDocuments => _number_of_documents;
        public int NumberOfWords => _words.Count;

        public BagOfWords1() =>
            _words = new ConcurrentDictionary<string, int[]>();

        /// <summary>
        /// Trains on a set of documents; the words in each document must be
        /// lemmatized/stemmed and unique within the document.
        /// </summary>
        /// <param name="documents"></param>
        public void Learn(string[][] documents)
        {
            Parallel.ForEach(documents, doc =>
            {
                Interlocked.Increment(ref _number_of_documents);
                foreach (var word in doc)
                {
                    if (!_words.ContainsKey(word))
                    {
                        // A competing thread may add the word between the check and TryAdd;
                        // in that case only bump its document frequency.
                        if (false == _words.TryAdd(word, new int[2] { Interlocked.Increment(ref _words_count), 1 }))
                        {
                            Interlocked.Increment(ref _words[word][1]);
                        }
                    }
                    else
                    {
                        Interlocked.Increment(ref _words[word][1]);
                    }
                }
            });
        }

        /// <summary>
        /// Projects a document onto the learned vocabulary as a sparse TF-IDF vector.
        /// </summary>
        /// <param name="doc">Document whose words are lemmatized/stemmed the same way as in the model</param>
        /// <returns></returns>
        public SparceVector Transform(string[] doc)
        {
            if (doc == null || doc.Length == 0) return new SparceVector();
            // Count how many times each word occurs in the document.
            var map = new Dictionary<string, int>();
            foreach (var word in doc)
            {
                if (map.ContainsKey(word))
                {
                    map[word]++;
                }
                else
                {
                    map[word] = 1;
                }
            }
            var result = new Dictionary<int, double>();
            foreach (var word in doc)
            {
                if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0]))
                {
                    var tf = (double)map[word] / (double)doc.Length;
                    // Cast before dividing: integer division would floor the ratio and
                    // flatten the IDF term for frequent words.
                    var idf = Math.Log(1 + ((double)_number_of_documents / _words[word][1]));
                    var tfidf = tf * idf;
                    if (Math.Abs(tfidf) > double.Epsilon)
                    {
                        result.Add(_words[word][0], tfidf);
                    }
                }
            }
            return new SparceVector(result.Values.ToArray(), result.Keys.ToArray());
        }
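
        // Worked example (illustrative numbers): with N = 10 training documents and
        // a word seen in df = 2 of them, idf = ln(1 + 10/2) = ln(6) ≈ 1.79. If that
        // word accounts for 3 of a 30-word document, tf = 0.1, so tf-idf ≈ 0.179.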

        public void Deserialize(IBinaryReader reader)
        {
            this._number_of_documents = reader.ReadLong();
            this._words_count = reader.ReadInt32();
            this._words = reader.ReadDictionaryAsConcurrent<string, int[]>();
        }

        public void Serialize(IBinaryWriter writer)
        {
            writer.WriteLong(this._number_of_documents);
            writer.WriteInt32(this._words_count);
            writer.WriteDictionary<string, int[]>(this._words);
        }
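
        // Usage sketch (illustrative, not part of the library): train on tokenized
        // documents, then transform a new one. Any lemmatizer/stemmer applied at
        // training time must also be applied to documents passed to Transform.
        //
        //   var model = new BagOfWords1();
        //   model.Learn(new[]
        //   {
        //       new[] { "cat", "sat", "mat" },
        //       new[] { "dog", "sat", "log" }
        //   });
        //   SparceVector vector = model.Transform(new[] { "cat", "sat" });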
    }
}