pull/1/head
Ogoun 5 years ago
parent c9b9eadad6
commit bcb30bc693

@@ -1,5 +1,6 @@
 using Lemmatization;
 using System;
+using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
@@ -13,28 +14,39 @@ namespace TFIDFbee
 {
     class Program
     {
-        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204.json";
+        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
         private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
+        private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();

         static void Main(string[] args)
         {
+            var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
+            Console.WriteLine(string.Join('-', terms.ToTokens()));
+            Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
+            Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
+            Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
+            /*
             Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug);
             Configuration.Save(Configuration.ReadFromApplicationConfig());
-            IDocumentReader reader = new StateMachineReader(source, s => ExtractLemmas(s));
-            BagOfWords codebook;
+            IDocumentReader reader = new JsonByLineReader(source, s => ExtractLemmas(s));
+            ZeroLevel.Services.Semantic.Helpers.BagOfTerms codebook;
             if (File.Exists("model.bin"))
             {
                 Log.Info("Load model from file");
                 using (var stream = new FileStream("model.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                 {
-                    codebook = MessageSerializer.Deserialize<BagOfWords>(stream);
+                    codebook = MessageSerializer.Deserialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream);
                 }
             }
             else
             {
                 Log.Info("Create and train model");
-                codebook = new BagOfWords();
+                codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfTerms();
                 foreach (var batch in reader.ReadBatches(1000))
                 {
                     codebook.Learn(batch);
@@ -42,54 +54,52 @@ namespace TFIDFbee
                 }
                 using (var stream = new FileStream("model.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                 {
-                    MessageSerializer.Serialize<BagOfWords>(stream, codebook);
+                    MessageSerializer.Serialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream, codebook);
                 }
             }

-            Log.Info("Build document vectors");
-            List<SparceVector> vectors;
-            if (File.Exists("vectors.bin"))
-            {
-                Log.Info("Load vectors from file");
-                using (var stream = new FileStream("vectors.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
-                {
-                    vectors = MessageSerializer.DeserializeCompatible<List<SparceVector>>(stream);
-                }
-            }
-            else
-            {
                 Log.Info("Create vectors");
-                vectors = new List<SparceVector>();
                 foreach (var docs in reader.ReadRawDocumentBatches(1000))
                 {
                     foreach (var doc in docs)
                     {
-                        var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token).Concat(_lexer.ExtractLexTokens(doc.Item1).Select(t => t.Token)).ToArray();
-                        vectors.Add(codebook.Transform(words));
+                        var words = ExtractLemmas(doc.Item2).Concat(ExtractLemmas(doc.Item1)).Distinct().ToArray();
+                        var vector = codebook.Transform(words);
+                        for (var i = 0; i < words.Length; i++)
+                        {
+                            var word = words[i];
+                            if (false == _scoring.ContainsKey(word))
+                            {
+                                _scoring.TryAdd(word, vector);
+                            }
+                        }
                     }
                 }
                 using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                 {
                     MessageSerializer.SerializeCompatible<List<SparceVector>>(stream, vectors);
                 }
-            }

             Log.Info("Find similar documents");
             var list = new List<Tuple<double, int, int>>();
-            long total_count = (vectors.Count * vectors.Count);
+            long total_count = ((long)vectors.Count * (long)vectors.Count);
             long count = 0;
+            double d = (double.Epsilon * 2.0d);
             for (int i = 0; i < vectors.Count; i++)
             {
                 for (int j = i + 1; j < vectors.Count - 1; j++)
                 {
                     count++;
-                    if (count % 100000 == 0)
+                    if (count % 10000000 == 0)
                     {
-                        Log.Info($"Progress: {(int)(count * 100.0d / (double)total_count)} %.\tFound similars: {list.Count}.");
+                        Log.Info($"Progress: {((count * 100.0d) / total_count)} %.\tFound similars: {list.Count}.");
                     }
                     if (i == j) continue;
                     var diff = vectors[i].Measure(vectors[j]);
-                    if (diff > 0.885d)
+                    if (diff > d && diff < 0.0009d)
                     {
                         list.Add(Tuple.Create(diff, i, j));
                     }
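
Note on the total_count change in the hunk above: vectors.Count * vectors.Count multiplies two int values in 32-bit arithmetic, so the product can overflow before being widened to long; casting the operands to long, as the new line does, performs the multiplication in 64-bit. A minimal standalone illustration (not part of the commit):

    using System;

    class OverflowDemo
    {
        static void Main()
        {
            int count = 100_000;                      // e.g. 100k document vectors
            long wrong = count * count;               // int * int overflows first: 1410065408
            long right = (long)count * (long)count;   // multiplication done as long: 10000000000
            Console.WriteLine($"{wrong} vs {right}");
        }
    }
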
@@ -141,7 +151,7 @@ namespace TFIDFbee
                     output.WriteLine();
                 }
             }
+            */
             Console.WriteLine("Completed");
             Console.ReadKey();
         }

@@ -10,18 +10,24 @@ namespace ZeroLevel.Services.Semantic
         /// <returns>List of tokens</returns>
         IEnumerable<LexToken> ExtractLexTokens(string text);
+        IEnumerable<LexToken> ExtractLexTokens(string[] words);
+
         /// <summary>
         /// Selecting unique tokens from text
         /// </summary>
         /// <returns>Tokens</returns>
         IEnumerable<LexToken> ExtractUniqueLexTokens(string text);
+        IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words);
+
         /// <summary>
         /// Allocation of unique tokens from text with drop of stop words
         /// </summary>
         /// <returns>Tokens</returns>
         IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text);
+        IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words);
+
         /// <summary>
         /// Search for tokens in the text corresponding to the specified words (full-text search)
         /// </summary>

@@ -1,14 +1,84 @@
-using System;
+using Iveonik.Stemmers;
+using System;
 using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.Linq;
 using System.Threading;
 using System.Threading.Tasks;
+using ZeroLevel.Implementation.Semantic.Helpers;
 using ZeroLevel.Services.Serialization;

 namespace ZeroLevel.Services.Semantic.Helpers
 {
-    public class BagOfWords :
+    public class BagOfTerms
+    {
+        private string[] _words;
+        private ILexProvider _lexer;
+
+        public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { }
+
+        public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { }
+
+        public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { }
+
+        public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { }
+
+        public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { }
+
+        public BagOfTerms(string[] words, ILexProvider lexer)
+        {
+            _lexer = lexer;
+            _frequency = null;
+            _words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray();
+        }
+
+        public string[] Words => _words;
+
+        private IDictionary<string, int> _frequency;
+
+        public IDictionary<string, int> Freguency()
+        {
+            if (_frequency == null)
+            {
+                var frequency = new Dictionary<string, int>();
+                for (int i = 0; i < _words.Length; i++)
+                {
+                    if (frequency.ContainsKey(_words[i]))
+                    {
+                        frequency[_words[i]]++;
+                    }
+                    else
+                    {
+                        frequency[_words[i]] = 1;
+                    }
+                }
+                _frequency = frequency;
+            }
+            return _frequency;
+        }
+
+        public string[] ToTokens()
+        {
+            return _words;
+        }
+
+        public string[] ToUniqueTokens()
+        {
+            return _words.DistinctBy(s => s)
+                .ToArray();
+        }
+
+        public string[] ToUniqueTokensWithoutStopWords()
+        {
+            return _words.Where(w => StopWords.IsStopWord(w) == false)
+                .DistinctBy(s => s)
+                .ToArray();
+        }
+    }
+
+    public class BagOfWords1 :
         IBinarySerializable
     {
         private ConcurrentDictionary<string, int[]> _words;
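
A minimal usage sketch for the BagOfTerms class added above. The word array and the lexer choice are illustrative only; LemmaLexer comes from the Lemmatization package referenced in Program.cs, and the constructors without a lexer fall back to Iveonik's RussianStemmer, as shown in the code above:

    using Lemmatization;
    using ZeroLevel.Services.Semantic;
    using ZeroLevel.Services.Semantic.Helpers;

    ILexProvider lexer = new LexProvider(new LemmaLexer());
    var terms = new BagOfTerms(new[] { "cats", "dogs", "cats", "the" }, lexer);

    string[] all     = terms.ToTokens();                        // every lexed token, duplicates kept
    string[] unique  = terms.ToUniqueTokens();                  // duplicates collapsed
    string[] noStops = terms.ToUniqueTokensWithoutStopWords();  // additionally drops stop words
    var counts       = terms.Freguency();                       // token -> occurrence count (method name as committed)
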
@@ -18,7 +88,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
         public long NumberOfDocuments => _number_of_documents;
         public int NumberOfWords => _words.Count;

-        public BagOfWords() =>
+        public BagOfWords1() =>
             _words = new ConcurrentDictionary<string, int[]>();

         /// <summary>
@@ -74,7 +144,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
                 if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0]))
                 {
                     var tf = (double)map[word] / (double)doc.Length;
-                    var idf = Math.Log(_number_of_documents / _words[word][1]);
+                    var idf = Math.Log(1 + (_number_of_documents / _words[word][1]));
                     var tfidf = tf * idf;
                     if (Math.Abs(tfidf) > double.Epsilon)
                     {
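
The idf change in the hunk above replaces log(N / df) with log(1 + N / df), where N is the number of learned documents and df is the number of documents containing the word. Under the old formula a word that occurs in every document gets idf = log(1) = 0, so its tf-idf is discarded by the double.Epsilon check; the smoothed form keeps such words with a small positive weight. A rough worked example (values are illustrative, not from the data set):

    long N = 1000;            // documents learned by the codebook
    long df = 1000;           // the word occurs in every document
    double tf = 5.0 / 200.0;  // 5 occurrences in a 200-token document

    double idfOld = Math.Log(N / df);        // log(1)  = 0    -> term dropped
    double idfNew = Math.Log(1 + (N / df));  // log(2) ~= 0.69 -> tf-idf ~= 0.017
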

@@ -14,6 +14,8 @@ namespace ZeroLevel.Services.Semantic.Helpers
         private double[] values;
         private double power;

         public SparceVector()
         {
             indexes = EmptyIndexes;

@@ -62,6 +62,16 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
             return result;
         }

+        public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
+        {
+            var result = new List<WordToken>();
+            for (int i = 0; i < words.Length; i++)
+            {
+                result.Add(new WordToken(words[i], i));
+            }
+            return result;
+        }
+
         /// <summary>
         /// Selection of unique tokens from the text (first entry)
         /// </summary>

@@ -72,6 +82,11 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
             return ExtractWordTokens(text).DistinctBy(t => t.Word);
         }

+        public static IEnumerable<WordToken> ExtractUniqueWordTokens(string[] words)
+        {
+            return ExtractWordTokens(words).DistinctBy(t => t.Word);
+        }
+
         /// <summary>
         /// Allocation of unique tokens from text with drop of stop words
         /// </summary>

@@ -81,5 +96,10 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
         {
             return ExtractWordTokens(text).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false);
         }

+        public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string[] words)
+        {
+            return ExtractWordTokens(words).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false);
+        }
     }
 }

@@ -25,7 +25,16 @@ namespace ZeroLevel.Services.Semantic
             {
                 result.Add(new LexToken(match.Value, _lexer.Lex(match.Value), match.Index));
             }
+            return result;
+        }
+
+        public IEnumerable<LexToken> ExtractLexTokens(string[] words)
+        {
+            var result = new List<LexToken>();
+            for (int i = 0; i < words.Length; i++)
+            {
+                result.Add(new LexToken(words[i], _lexer.Lex(words[i]), i));
+            }
             return result;
         }

@@ -35,12 +44,24 @@ namespace ZeroLevel.Services.Semantic
                 .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
         }

+        public IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words)
+        {
+            return TextAnalizer.ExtractUniqueWordTokens(words)
+                .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
+        }
+
         public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text)
         {
             return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(text)
                 .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
         }

+        public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words)
+        {
+            return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(words)
+                .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
+        }
+
         public IDictionary<string, IEnumerable<LexToken>> SearchLexTokensByWords(string text, string[] words)
         {
             var result = new Dictionary<string, IEnumerable<LexToken>>();
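
The string[] overloads added to ILexProvider, LexProvider and TextAnalizer let callers pass text that is already split into words, skipping the regex word extraction; token positions then become array indexes rather than character offsets. A short hedged sketch of how they are called (the stemmer choice mirrors the defaults used by BagOfTerms above):

    using Iveonik.Stemmers;
    using ZeroLevel.Services.Semantic;

    ILexProvider provider = new LexProvider(new RussianStemmer());
    string[] words = { "кот", "коты", "кот", "и" };

    var tokens = provider.ExtractLexTokens(words);                       // one LexToken per word, Position = index
    var unique = provider.ExtractUniqueLexTokens(words);                 // distinct by lexed token
    var clean  = provider.ExtractUniqueLexTokensWithoutStopWords(words); // also drops stop words
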
