// Zero/TFIDFbee/TFIDFbee/Program.cs

using Lemmatization;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using TFIDFbee.Reader;
using ZeroLevel;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
using ZeroLevel.Services.Serialization;
namespace TFIDFbee
{
    class Program
    {
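        // Source corpus: a lenta.ru news dump limited to 1000 records.
        // The lemmatizing lexer and the word-score map are shared by the whole pipeline.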
        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
        private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
        private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();

        static void Main(string[] args)
        {
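            // Quick demo of BagOfTerms over a sample Russian sentence (roughly: "In practice
            // these extensions are rarely used, especially those that for the calculation..."):
            // raw tokens, unique tokens, unique tokens without stop words, and per-token
            // frequency ("Freguency" is the method's spelling in the API).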
            var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
            Console.WriteLine(string.Join('-', terms.ToTokens()));
            Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
            Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
            Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
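
            // The block below is the full (currently disabled) pipeline: train or load
            // a term model, vectorize the corpus, and search for near-duplicate pairs.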
            /*
            Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug);
            Configuration.Save(Configuration.ReadFromApplicationConfig());
            IDocumentReader reader = new JsonByLineReader(source, s => ExtractLemmas(s));
            BagOfTerms codebook;
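            // Reuse a previously trained model if one was cached by an earlier run;
            // otherwise train it from the corpus in batches of 1000 documents and save it.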
            if (File.Exists("model.bin"))
            {
                Log.Info("Load model from file");
                using (var stream = new FileStream("model.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    codebook = MessageSerializer.Deserialize<BagOfTerms>(stream);
                }
            }
            else
            {
                Log.Info("Create and train model");
                codebook = new BagOfTerms();
                foreach (var batch in reader.ReadBatches(1000))
                {
                    codebook.Learn(batch);
                    Log.Info($"\r\n\tDocuments: {codebook.NumberOfDocuments}\r\n\tWords: {codebook.NumberOfWords}");
                }
                using (var stream = new FileStream("model.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
                    MessageSerializer.Serialize<BagOfTerms>(stream, codebook);
                }
            }
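            // Vectorize every document from its title and body lemmas, collecting one
            // sparse vector per document for the pairwise comparison below.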
Log.Info("Create vectors");
foreach (var docs in reader.ReadRawDocumentBatches(1000))
{
foreach (var doc in docs)
{
var words = ExtractLemmas(doc.Item2).Concat(ExtractLemmas(doc.Item1)).Distinct().ToArray();
var vector = codebook.Transform(words);
for (var i = 0; i< words.Length; i++)
{
var word = words[i];
if (false == _scoring.ContainsKey(word))
{
_scoring.TryAdd(word, vector)
}
}
}
}
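            // Persist the document vectors alongside the model.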
using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
{
MessageSerializer.SerializeCompatible<List<SparceVector>>(stream, vectors);
}
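            // Brute-force scan over all vector pairs (i < j); a pair is recorded as
            // similar when its distance lies between 2 * double.Epsilon and 0.0009.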
Log.Info("Find similar documents");
var list = new List<Tuple<double, int, int>>();
long total_count = ((long)vectors.Count * (long)vectors.Count);
long count = 0;
double d = (double.Epsilon * 2.0d);
for (int i = 0; i < vectors.Count; i++)
{
for (int j = i + 1; j < vectors.Count - 1; j++)
{
count++;
if (count % 10000000 == 0)
{
Log.Info($"Progress: {((count * 100.0d) / total_count)} %.\tFound similars: {list.Count}.");
}
if (i == j) continue;
var diff = vectors[i].Measure(vectors[j]);
if (diff > d && diff < 0.0009d)
{
list.Add(Tuple.Create(diff, i, j));
}
}
}
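            // Keep the 2000 closest pairs and note every document index they involve.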
Log.Info("Prepare to show similar documents");
var to_present = list.OrderBy(e => e.Item1).Take(2000).ToArray();
var to_present_map = new Dictionary<int, Tuple<string, string>>();
foreach (var e in to_present)
{
if (!to_present_map.ContainsKey(e.Item2))
{
to_present_map.Add(e.Item2, null);
}
if (!to_present_map.ContainsKey(e.Item3))
{
to_present_map.Add(e.Item3, null);
}
}
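            // Second pass over the corpus: fetch the raw (title, body) tuples for the
            // selected indices, since only the vectors were kept in memory.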
            int index = 0;
            foreach (var docs in reader.ReadRawDocumentBatches(1000))
            {
                foreach (var doc in docs)
                {
                    if (to_present_map.ContainsKey(index))
                    {
                        to_present_map[index] = doc;
                    }
                    index++;
                }
            }
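            // Write each similar pair, prefixed with its distance, to out.txt.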
Log.Info("Show similar documents");
index = 0;
using (var output = new StreamWriter("out.txt"))
{
foreach (var e in to_present)
{
output.WriteLine($"#{index++}: {e.Item1}");
output.WriteLine("-------------1--------------");
output.WriteLine(to_present_map[e.Item2].Item1);
output.WriteLine(to_present_map[e.Item2].Item2);
output.WriteLine("-------------2--------------");
output.WriteLine(to_present_map[e.Item3].Item1);
output.WriteLine(to_present_map[e.Item3].Item2);
output.WriteLine("#############################");
output.WriteLine();
}
}
*/
Console.WriteLine("Completed");
Console.ReadKey();
}
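
        /// <summary>
        /// Extracts the unique lemmas of a text, dropping stop words and any token
        /// that contains no letters (numbers, punctuation, etc.).
        /// </summary>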
        private static IEnumerable<string> ExtractLemmas(string text)
        {
            return _lexer.ExtractUniqueLexTokensWithoutStopWords(text)
                .Select(t => t.Token)
                .Where(s => s.Any(char.IsLetter));
        }
    }
}
