fix keyword extraction

pull/1/head
Ogoun 5 years ago
parent bcb30bc693
commit cc54d87d87

@ -2,31 +2,81 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading;
using TFIDFbee.Reader;
using ZeroLevel;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
using ZeroLevel.Services.Serialization;
namespace TFIDFbee
{
public class IDF
{
    // Per-term document frequency: how many learned documents contain the term.
    private readonly ConcurrentDictionary<string, int> _terms =
        new ConcurrentDictionary<string, int>();
    // Total number of documents passed to Learn().
    private long _documents_count = 0;

    /// <summary>
    /// Registers one document: bumps the total document count and the
    /// document frequency of every distinct term in the bag.
    /// </summary>
    public void Learn(BagOfTerms bag)
    {
        // The dictionary is already thread-safe; a bare ++ on the counter was not.
        // Interlocked keeps both consistent if Learn is ever called concurrently.
        Interlocked.Increment(ref _documents_count);
        foreach (var term in bag.ToUniqueTokens())
        {
            _terms.AddOrUpdate(term, 1, (w, o) => o + 1);
        }
    }

    /// <summary>
    /// Inverse document frequency: log(1 + N / df(term)).
    /// Returns 0 for a term never seen during learning.
    /// </summary>
    public double Idf(string term)
    {
        // Single TryGetValue instead of ContainsKey + indexer: avoids a double
        // lookup and a race between the check and the read.
        if (_terms.TryGetValue(term, out int count_documents_with_term))
        {
            double total_documents = (double)Interlocked.Read(ref _documents_count);
            return Math.Log(1.0d + (total_documents / (double)count_documents_with_term));
        }
        return 0.0d;
    }
}
public static class TFIDF
{
    /// <summary>
    /// Computes the TF-IDF weight of every distinct non-stop-word token of the
    /// document: idf(term) * count(term) / totalWords.
    /// </summary>
    public static IDictionary<string, double> TfIdf(BagOfTerms document, IDF idf)
    {
        var frequency = document.Freguency();
        double totalWords = (double)document.Words.Length;
        var weights = new Dictionary<string, double>();
        foreach (var token in document.ToUniqueTokensWithoutStopWords())
        {
            weights[token] = idf.Idf(token) * (double)frequency[token] / totalWords;
        }
        return weights;
    }
}
class Program
{
private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json";
private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();
static void Main(string[] args)
{
var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
Console.WriteLine(string.Join('-', terms.ToTokens()));
Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
IDF idf = new IDF();
IDocumentReader reader = new JsonByLineReader(source, _lexer);
foreach (var batch in reader.ReadBatches(1000))
{
foreach (var doc in batch)
{
idf.Learn(doc);
}
}
foreach (var batch in reader.ReadBatches(1000))
{
foreach (var doc in batch)
{
var tfidf = TFIDF.TfIdf(doc, idf);
Console.WriteLine(String.Join(" ", tfidf.OrderByDescending(p => p.Value).Take(10).Select(p => p.Key)));
Console.WriteLine();
Console.WriteLine(" ***");
Console.WriteLine();
Thread.Sleep(1000);
}
}
/*

@ -1,11 +1,10 @@
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using ZeroLevel.Services.Semantic.Helpers;
namespace TFIDFbee.Reader
{
    /// <summary>
    /// A source of documents already tokenized into bags of terms,
    /// produced lazily in batches.
    /// </summary>
    public interface IDocumentReader
    {
        /// <summary>
        /// Reads the source and yields documents in batches of up to
        /// <paramref name="size"/> items each.
        /// </summary>
        // NOTE(review): the original text carried diff residue — a second
        // ReadBatches returning string[][] and a removed ReadRawDocumentBatches —
        // which is not valid C#; this is the post-commit shape of the interface.
        IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size);
    }
}

@ -2,6 +2,8 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
namespace TFIDFbee.Reader
{
@ -9,78 +11,18 @@ namespace TFIDFbee.Reader
: IDocumentReader
{
private readonly string _file;
private readonly Func<string, IEnumerable<string>> _lexer;
private readonly ILexProvider _lexer;
public JsonByLineReader(string file, Func<string, IEnumerable<string>> lexer)
public JsonByLineReader(string file, ILexProvider lexer)
{
_file = file;
_lexer = lexer;
}
/// <summary>
/// Materializes each lazily-parsed batch into an array of tokenized documents.
/// </summary>
public IEnumerable<string[][]> ReadBatches(int size)
{
    // The ToArray() snapshot is taken when this iterator yields — before the
    // underlying parser resumes and clears its reusable buffer — so callers
    // receive a stable copy. The original also allocated a List<string[]>
    // that was only ever Clear()ed, never read; that dead local is removed.
    foreach (var batch in ReadDocumentBatches(size))
    {
        yield return batch.ToArray();
    }
}
// Streams the source file line by line, scraping "metaTitle" / "plaintext"
// value pairs and yielding tokenized documents in batches of up to `size`.
// NOTE(review): assumes the JSON is laid out one field per line with the
// metaTitle line appearing before its plaintext line — TODO confirm against
// the data set format.
private IEnumerable<IEnumerable<string[]>> ReadDocumentBatches(int size)
{
    string line;
    var batch = new List<string[]>();
    string title = null;   // pending title, waiting for its plaintext
    string text = null;
    using (StreamReader reader = new StreamReader(_file))
    {
        while ((line = reader.ReadLine()) != null)
        {
            var titleIndex = line.IndexOf("\"metaTitle\":");
            if (titleIndex >= 0)
            {
                // Extract the quoted value: first quote after the key,
                // last quote on the line.
                var start = line.IndexOf("\"", titleIndex + 12);
                var end = line.LastIndexOf("\"");
                if (start < end && start != -1 && end != -1)
                {
                    title = line.Substring(start + 1, end - start - 1);
                }
            }
            else
            {
                var textIndex = line.IndexOf("\"plaintext\":");
                // Only pair a plaintext with a previously captured title.
                if (textIndex >= 0 && title != null)
                {
                    var start = line.IndexOf("\"", textIndex + 12);
                    var end = line.LastIndexOf("\"");
                    if (start < end && start != -1 && end != -1)
                    {
                        text = line.Substring(start + 1, end - start - 1);
                        // One document = lexed title tokens followed by lexed body tokens.
                        batch.Add(_lexer(title).Concat(_lexer(text)).ToArray());
                        if (batch.Count >= size)
                        {
                            // The same List instance is yielded and then cleared;
                            // the caller must copy it before this iterator resumes
                            // (ReadBatches does so via ToArray()).
                            yield return batch;
                            batch.Clear();
                            // NOTE(review): forced full GC per batch is an
                            // anti-pattern and likely hurts throughput — consider removing.
                            GC.Collect(2);
                        }
                        title = null;
                        text = null;
                    }
                }
            }
        }
    }
    // Flush the final partial batch, if any.
    if (batch.Count > 0)
    {
        yield return batch;
    }
}
public IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size)
public IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size)
{
string line;
var batch = new List<Tuple<string, string>>();
var batch = new List<BagOfTerms>();
string title = null;
string text = null;
using (StreamReader reader = new StreamReader(_file))
@ -107,7 +49,7 @@ namespace TFIDFbee.Reader
if (start < end && start != -1 && end != -1)
{
text = line.Substring(start + 1, end - start - 1);
batch.Add(Tuple.Create(title, text));
batch.Add(new BagOfTerms(title + " " + text, _lexer));
if (batch.Count >= size)
{
yield return batch;

@ -1,7 +1,7 @@
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using ZeroLevel.Services.Semantic;
using ZeroLevel.Services.Semantic.Helpers;
namespace TFIDFbee.Reader
{
@ -9,9 +9,9 @@ namespace TFIDFbee.Reader
: IDocumentReader
{
private readonly string _file;
private readonly Func<string, IEnumerable<string>> _lexer;
private readonly ILexProvider _lexer;
public StateMachineReader(string file, Func<string, IEnumerable<string>> lexer)
public StateMachineReader(string file, ILexProvider lexer)
{
_file = file;
_lexer = lexer;
@ -46,30 +46,12 @@ namespace TFIDFbee.Reader
}
}
public IEnumerable<string[][]> ReadBatches(int size)
public IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size)
{
var list = new List<string[]>();
var list = new List<BagOfTerms>();
foreach (var record in Parse())
{
list.Add((_lexer(record[0]).Concat(_lexer(record[1])).ToArray()));
if (list.Count > size)
{
yield return list.ToArray();
list.Clear();
}
}
if (list.Count > 0)
{
yield return list.ToArray();
}
}
public IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size)
{
var list = new List<Tuple<string, string>>();
foreach (var record in Parse())
{
list.Add(Tuple.Create(record[0], record[1]));
list.Add(new BagOfTerms(record[0] + " " + record[1], _lexer));
if (list.Count > size)
{
yield return list.ToArray();

Loading…
Cancel
Save

Powered by TurnKey Linux.