commit bcb30bc693 (branch: pull/1/head)
parent c9b9eadad6
Author: Ogoun, 5 years ago

@@ -1,5 +1,6 @@
using Lemmatization;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
@@ -13,28 +14,39 @@ namespace TFIDFbee
{
class Program
{
private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204.json";
private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();
static void Main(string[] args)
{
var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
Console.WriteLine(string.Join('-', terms.ToTokens()));
Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
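// The four calls above exercise the new BagOfTerms type on a Russian sample
// sentence: the full lemmatised token stream, the unique tokens, the unique
// tokens with stop words removed, and the per-token frequency table
// (note the method is spelled Freguency() in the library).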
/*
Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug);
Configuration.Save(Configuration.ReadFromApplicationConfig());
IDocumentReader reader = new StateMachineReader(source, s => ExtractLemmas(s));
IDocumentReader reader = new JsonByLineReader(source, s => ExtractLemmas(s));
BagOfWords codebook;
ZeroLevel.Services.Semantic.Helpers.BagOfTerms codebook;
if (File.Exists("model.bin"))
{
Log.Info("Load model from file");
using (var stream = new FileStream("model.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
codebook = MessageSerializer.Deserialize<BagOfWords>(stream);
codebook = MessageSerializer.Deserialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream);
}
}
else
{
Log.Info("Create and train model");
codebook = new BagOfWords();
codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfTerms();
foreach (var batch in reader.ReadBatches(1000))
{
codebook.Learn(batch);
@@ -42,54 +54,52 @@ namespace TFIDFbee
}
using (var stream = new FileStream("model.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
{
MessageSerializer.Serialize<BagOfWords>(stream, codebook);
MessageSerializer.Serialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream, codebook);
}
}
Log.Info("Build document vectors");
List<SparceVector> vectors;
if (File.Exists("vectors.bin"))
{
Log.Info("Load vectors from file");
using (var stream = new FileStream("vectors.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
vectors = MessageSerializer.DeserializeCompatible<List<SparceVector>>(stream);
}
}
else
Log.Info("Create vectors");
foreach (var docs in reader.ReadRawDocumentBatches(1000))
{
Log.Info("Create vectors");
vectors = new List<SparceVector>();
foreach (var docs in reader.ReadRawDocumentBatches(1000))
foreach (var doc in docs)
{
foreach (var doc in docs)
var words = ExtractLemmas(doc.Item2).Concat(ExtractLemmas(doc.Item1)).Distinct().ToArray();
var vector = codebook.Transform(words);
for (var i = 0; i < words.Length; i++)
{
var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token).Concat(_lexer.ExtractLexTokens(doc.Item1).Select(t => t.Token)).ToArray();
vectors.Add(codebook.Transform(words));
var word = words[i];
if (false == _scoring.ContainsKey(word))
{
_scoring.TryAdd(word, vector);
}
}
}
using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
{
MessageSerializer.SerializeCompatible<List<SparceVector>>(stream, vectors);
}
}
using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
{
MessageSerializer.SerializeCompatible<List<SparceVector>>(stream, vectors);
}
Log.Info("Find similar documents");
var list = new List<Tuple<double, int, int>>();
long total_count = (vectors.Count * vectors.Count);
long total_count = ((long)vectors.Count * (long)vectors.Count);
long count = 0;
double d = (double.Epsilon * 2.0d);
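// A note on the scan below (an assumption from how the thresholds are used,
// not a documented contract): vectors[i].Measure(vectors[j]) appears to act
// as a distance, so this commit replaces the old "diff > 0.885" check with a
// narrow band just above double.Epsilon and below 0.0009 to pick out
// near-duplicate documents. The loop compares every pair once (j starts at
// i + 1); total_count is cast to long to avoid 32-bit overflow for large
// document sets, and progress is logged only every 10,000,000 comparisons.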
for (int i = 0; i < vectors.Count; i++)
{
for (int j = i + 1; j < vectors.Count - 1; j++)
{
count++;
if (count % 100000 == 0)
if (count % 10000000 == 0)
{
Log.Info($"Progress: {(int)(count * 100.0d / (double)total_count)} %.\tFound similars: {list.Count}.");
Log.Info($"Progress: {((count * 100.0d) / total_count)} %.\tFound similars: {list.Count}.");
}
if (i == j) continue;
var diff = vectors[i].Measure(vectors[j]);
if (diff > 0.885d)
if (diff > d && diff < 0.0009d)
{
list.Add(Tuple.Create(diff, i, j));
}
@@ -141,7 +151,7 @@ namespace TFIDFbee
output.WriteLine();
}
}
*/
Console.WriteLine("Completed");
Console.ReadKey();
}

@@ -10,18 +10,24 @@ namespace ZeroLevel.Services.Semantic
/// <returns>List of tokens</returns>
IEnumerable<LexToken> ExtractLexTokens(string text);
IEnumerable<LexToken> ExtractLexTokens(string[] words);
/// <summary>
/// Selecting unique tokens from text
/// </summary>
/// <returns>Tokens</returns>
IEnumerable<LexToken> ExtractUniqueLexTokens(string text);
IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words);
/// <summary>
/// Extraction of unique tokens from text, with stop words removed
/// </summary>
/// <returns>Tokens</returns>
IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text);
IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words);
/// <summary>
/// Search for tokens in the text corresponding to the specified words (full-text search)
/// </summary>

@@ -1,14 +1,84 @@
using System;
using Iveonik.Stemmers;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using ZeroLevel.Implementation.Semantic.Helpers;
using ZeroLevel.Services.Serialization;
namespace ZeroLevel.Services.Semantic.Helpers
{
public class BagOfWords :
public class BagOfTerms
{
private string[] _words;
private ILexProvider _lexer;
public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { }
public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { }
public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { }
public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { }
public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { }
public BagOfTerms(string[] words, ILexProvider lexer)
{
_lexer = lexer;
_frequency = null;
_words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray();
}
public string[] Words => _words;
private IDictionary<string, int> _frequency;
public IDictionary<string, int> Freguency()
{
if (_frequency == null)
{
var frequency = new Dictionary<string, int>();
for (int i = 0; i < _words.Length; i++)
{
if (frequency.ContainsKey(_words[i]))
{
frequency[_words[i]]++;
}
else
{
frequency[_words[i]] = 1;
}
}
_frequency = frequency;
}
return _frequency;
}
public string[] ToTokens()
{
return _words;
}
public string[] ToUniqueTokens()
{
return _words.DistinctBy(s => s)
.ToArray();
}
public string[] ToUniqueTokensWithoutStopWords()
{
return _words.Where(w => StopWords.IsStopWord(w) == false)
.DistinctBy(s => s)
.ToArray();
}
}
public class BagOfWords1 :
IBinarySerializable
{
private ConcurrentDictionary<string, int[]> _words;
@@ -18,7 +88,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
public long NumberOfDocuments => _number_of_documents;
public int NumberOfWords => _words.Count;
public BagOfWords() =>
public BagOfWords1() =>
_words = new ConcurrentDictionary<string, int[]>();
/// <summary>
@@ -74,7 +144,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0]))
{
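// How this method scores a word (a reading of the surrounding code, not an
// authoritative description of the library): tf is the word's count in the
// current document divided by the document length, and idf is
// Math.Log(1 + N / df), where N is the number of learned documents and df is
// how many of them contain the word. The "+ 1" added in this commit keeps
// idf from collapsing to log(1) = 0 for words that occur in every document.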
var tf = (double)map[word] / (double)doc.Length;
var idf = Math.Log(_number_of_documents / _words[word][1]);
var idf = Math.Log(1 + (_number_of_documents / _words[word][1]));
var tfidf = tf * idf;
if (Math.Abs(tfidf) > double.Epsilon)
{

@@ -14,6 +14,8 @@ namespace ZeroLevel.Services.Semantic.Helpers
private double[] values;
private double power;
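// Parameterless constructor added in this commit; presumably required so the
// binary deserializer can create an instance before populating its fields
// (an assumption, since the serializer contract is not shown in this hunk).
// It starts the vector off with the EmptyIndexes placeholder.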
public SparceVector()
{
indexes = EmptyIndexes;

@@ -62,6 +62,16 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
return result;
}
public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
{
var result = new List<WordToken>();
for (int i = 0; i < words.Length; i++)
{
result.Add(new WordToken(words[i], i));
}
return result;
}
/// <summary>
/// Selection of unique tokens from the text (first entry)
/// </summary>
@@ -72,6 +82,11 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
return ExtractWordTokens(text).DistinctBy(t => t.Word);
}
public static IEnumerable<WordToken> ExtractUniqueWordTokens(string[] words)
{
return ExtractWordTokens(words).DistinctBy(t => t.Word);
}
/// <summary>
/// Extraction of unique tokens from text, with stop words removed
/// </summary>
@@ -81,5 +96,10 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
{
return ExtractWordTokens(text).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false);
}
public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string[] words)
{
return ExtractWordTokens(words).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false);
}
}
}

@@ -25,7 +25,16 @@ namespace ZeroLevel.Services.Semantic
{
result.Add(new LexToken(match.Value, _lexer.Lex(match.Value), match.Index));
}
return result;
}
public IEnumerable<LexToken> ExtractLexTokens(string[] words)
{
var result = new List<LexToken>();
for (int i = 0; i < words.Length; i++)
{
result.Add(new LexToken(words[i], _lexer.Lex(words[i]), i));
}
return result;
}
@@ -35,12 +44,24 @@ namespace ZeroLevel.Services.Semantic
.Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
}
public IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words)
{
return TextAnalizer.ExtractUniqueWordTokens(words)
.Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
}
public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text)
{
return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(text)
.Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
}
public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words)
{
return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(words)
.Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
}
public IDictionary<string, IEnumerable<LexToken>> SearchLexTokensByWords(string text, string[] words)
{
var result = new Dictionary<string, IEnumerable<LexToken>>();
