fix keyword extraction

pull/1/head
Ogoun 5 years ago
parent bcb30bc693
commit cc54d87d87

@@ -2,31 +2,81 @@
 using System;
 using System.Collections.Concurrent;
 using System.Collections.Generic;
+using System.IO;
 using System.Linq;
+using System.Threading;
 using TFIDFbee.Reader;
+using ZeroLevel;
 using ZeroLevel.Services.Semantic;
 using ZeroLevel.Services.Semantic.Helpers;
+using ZeroLevel.Services.Serialization;
 
 namespace TFIDFbee
 {
+    public class IDF
+    {
+        private ConcurrentDictionary<string, int> _terms =
+            new ConcurrentDictionary<string, int>();
+        private long _documents_count = 0;
+
+        public void Learn(BagOfTerms bag)
+        {
+            _documents_count++;
+            foreach (var term in bag.ToUniqueTokens())
+            {
+                _terms.AddOrUpdate(term, 1, (w, o) => o + 1);
+            }
+        }
+
+        public double Idf(string term)
+        {
+            if (_terms.ContainsKey(term))
+            {
+                double count_documents_with_term = (double)_terms[term];
+                double total_documents = (double)_documents_count;
+                return Math.Log(1.0d + (total_documents / count_documents_with_term));
+            }
+            return 0.0d;
+        }
+    }
+
+    public static class TFIDF
+    {
+        public static IDictionary<string, double> TfIdf(BagOfTerms document, IDF idf)
+        {
+            var freg = document.Freguency();
+            return document
+                .ToUniqueTokensWithoutStopWords()
+                .ToDictionary(t => t, t => idf.Idf(t) * (double)freg[t] / (double)document.Words.Length);
+        }
+    }
+
     class Program
     {
-        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
+        private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json";
         private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
         private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();
 
         static void Main(string[] args)
         {
-            var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
-
-            Console.WriteLine(string.Join('-', terms.ToTokens()));
-            Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
-            Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
-            Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
+            IDF idf = new IDF();
+            IDocumentReader reader = new JsonByLineReader(source, _lexer);
+            foreach (var batch in reader.ReadBatches(1000))
+            {
+                foreach (var doc in batch)
+                {
+                    idf.Learn(doc);
+                }
+            }
+            foreach (var batch in reader.ReadBatches(1000))
+            {
+                foreach (var doc in batch)
+                {
+                    var tfidf = TFIDF.TfIdf(doc, idf);
+                    Console.WriteLine(String.Join(" ", tfidf.OrderByDescending(p => p.Value).Take(10).Select(p => p.Key)));
+                    Console.WriteLine();
+                    Console.WriteLine(" ***");
+                    Console.WriteLine();
+                    Thread.Sleep(1000);
+                }
+            }
 
             /*
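
For reference, the scoring introduced above is idf(t) = ln(1 + N / df(t)) multiplied by the within-document frequency count(t, d) / |d|. Below is a minimal, self-contained sketch of that computation; whitespace splitting stands in for the ILexProvider/BagOfTerms pipeline and stop words are not filtered, simplifications made only so the example runs without ZeroLevel. It is an illustration, not code from this commit.

// Standalone sketch of the same scoring scheme as the IDF and TFIDF classes above.
using System;
using System.Collections.Generic;
using System.Linq;

class TfIdfSketch
{
    static void Main()
    {
        var corpus = new[]
        {
            "the cat sat on the mat",
            "the dog sat on the log",
            "cats and dogs"
        };

        // Document frequency: in how many documents each term occurs at least once.
        var df = new Dictionary<string, int>();
        foreach (var doc in corpus)
        {
            foreach (var term in doc.Split(' ').Distinct())
            {
                df[term] = df.TryGetValue(term, out var n) ? n + 1 : 1;
            }
        }
        double totalDocs = corpus.Length;

        // Score one document the way TFIDF.TfIdf does:
        // idf(t) = ln(1 + N / df(t)),  tf(t) = count(t, d) / |d|.
        var words = corpus[0].Split(' ');
        var scores = words
            .Distinct()
            .ToDictionary(
                t => t,
                t => Math.Log(1.0 + totalDocs / df[t]) * words.Count(w => w == t) / (double)words.Length);

        foreach (var pair in scores.OrderByDescending(p => p.Value))
        {
            Console.WriteLine($"{pair.Key}: {pair.Value:F3}");
        }
    }
}

On this toy corpus the word "the" still scores highest (0.305 versus 0.231 for "cat"), which is exactly why the committed code keys the result on ToUniqueTokensWithoutStopWords rather than on all tokens.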

@@ -1,11 +1,10 @@
-using System;
 using System.Collections.Generic;
+using ZeroLevel.Services.Semantic.Helpers;
 
 namespace TFIDFbee.Reader
 {
     public interface IDocumentReader
     {
-        IEnumerable<string[][]> ReadBatches(int size);
-        public IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size);
+        IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size);
     }
 }

@@ -2,6 +2,8 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using ZeroLevel.Services.Semantic;
+using ZeroLevel.Services.Semantic.Helpers;
 
 namespace TFIDFbee.Reader
 {
@@ -9,78 +11,18 @@ namespace TFIDFbee.Reader
         : IDocumentReader
     {
         private readonly string _file;
-        private readonly Func<string, IEnumerable<string>> _lexer;
+        private readonly ILexProvider _lexer;
 
-        public JsonByLineReader(string file, Func<string, IEnumerable<string>> lexer)
+        public JsonByLineReader(string file, ILexProvider lexer)
         {
             _file = file;
             _lexer = lexer;
         }
 
-        public IEnumerable<string[][]> ReadBatches(int size)
-        {
-            var list = new List<string[]>();
-            foreach (var batch in ReadDocumentBatches(size))
-            {
-                yield return batch.ToArray();
-                list.Clear();
-            }
-        }
-
-        private IEnumerable<IEnumerable<string[]>> ReadDocumentBatches(int size)
-        {
-            string line;
-            var batch = new List<string[]>();
-            string title = null;
-            string text = null;
-            using (StreamReader reader = new StreamReader(_file))
-            {
-                while ((line = reader.ReadLine()) != null)
-                {
-                    var titleIndex = line.IndexOf("\"metaTitle\":");
-                    if (titleIndex >= 0)
-                    {
-                        var start = line.IndexOf("\"", titleIndex + 12);
-                        var end = line.LastIndexOf("\"");
-                        if (start < end && start != -1 && end != -1)
-                        {
-                            title = line.Substring(start + 1, end - start - 1);
-                        }
-                    }
-                    else
-                    {
-                        var textIndex = line.IndexOf("\"plaintext\":");
-                        if (textIndex >= 0 && title != null)
-                        {
-                            var start = line.IndexOf("\"", textIndex + 12);
-                            var end = line.LastIndexOf("\"");
-                            if (start < end && start != -1 && end != -1)
-                            {
-                                text = line.Substring(start + 1, end - start - 1);
-                                batch.Add(_lexer(title).Concat(_lexer(text)).ToArray());
-                                if (batch.Count >= size)
-                                {
-                                    yield return batch;
-                                    batch.Clear();
-                                    GC.Collect(2);
-                                }
-                                title = null;
-                                text = null;
-                            }
-                        }
-                    }
-                }
-            }
-            if (batch.Count > 0)
-            {
-                yield return batch;
-            }
-        }
-
-        public IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size)
+        public IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size)
         {
             string line;
-            var batch = new List<Tuple<string, string>>();
+            var batch = new List<BagOfTerms>();
             string title = null;
             string text = null;
             using (StreamReader reader = new StreamReader(_file))
@@ -107,7 +49,7 @@ namespace TFIDFbee.Reader
                             if (start < end && start != -1 && end != -1)
                             {
                                 text = line.Substring(start + 1, end - start - 1);
-                                batch.Add(Tuple.Create(title, text));
+                                batch.Add(new BagOfTerms(title + " " + text, _lexer));
                                 if (batch.Count >= size)
                                 {
                                     yield return batch;
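
Both readers locate fields with IndexOf/LastIndexOf string searches rather than a JSON parser, so they depend on the dump keeping each field on its own line. The sketch below shows that assumed layout and the extraction applied to it; the sample lines and the helper are hypothetical, used only to illustrate the logic in the hunk above.

// Sketch of the one-field-per-line layout the readers assume (illustrative only).
using System;

class LineFormatSketch
{
    static string ExtractValue(string line, string key)
    {
        var keyIndex = line.IndexOf(key);
        if (keyIndex < 0) return null;
        // The first quote after the key opens the value; the last quote closes it.
        var start = line.IndexOf("\"", keyIndex + key.Length);
        var end = line.LastIndexOf("\"");
        return (start != -1 && end != -1 && start < end)
            ? line.Substring(start + 1, end - start - 1)
            : null;
    }

    static void Main()
    {
        var titleLine = "  \"metaTitle\": \"Sample headline\",";
        var textLine = "  \"plaintext\": \"Body of the article\",";

        var title = ExtractValue(titleLine, "\"metaTitle\":");
        var text = ExtractValue(textLine, "\"plaintext\":");

        // The committed reader concatenates title and text and hands them to BagOfTerms.
        Console.WriteLine(title + " " + text);
    }
}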

@@ -1,7 +1,7 @@
-using System;
 using System.Collections.Generic;
 using System.IO;
-using System.Linq;
+using ZeroLevel.Services.Semantic;
+using ZeroLevel.Services.Semantic.Helpers;
 
 namespace TFIDFbee.Reader
 {
@@ -9,9 +9,9 @@ namespace TFIDFbee.Reader
         : IDocumentReader
     {
         private readonly string _file;
-        private readonly Func<string, IEnumerable<string>> _lexer;
+        private readonly ILexProvider _lexer;
 
-        public StateMachineReader(string file, Func<string, IEnumerable<string>> lexer)
+        public StateMachineReader(string file, ILexProvider lexer)
         {
             _file = file;
             _lexer = lexer;
@@ -46,30 +46,12 @@ namespace TFIDFbee.Reader
             }
         }
 
-        public IEnumerable<string[][]> ReadBatches(int size)
+        public IEnumerable<IEnumerable<BagOfTerms>> ReadBatches(int size)
         {
-            var list = new List<string[]>();
+            var list = new List<BagOfTerms>();
             foreach (var record in Parse())
             {
-                list.Add((_lexer(record[0]).Concat(_lexer(record[1])).ToArray()));
-                if (list.Count > size)
-                {
-                    yield return list.ToArray();
-                    list.Clear();
-                }
-            }
-            if (list.Count > 0)
-            {
-                yield return list.ToArray();
-            }
-        }
-
-        public IEnumerable<IEnumerable<Tuple<string, string>>> ReadRawDocumentBatches(int size)
-        {
-            var list = new List<Tuple<string, string>>();
-            foreach (var record in Parse())
-            {
-                list.Add(Tuple.Create(record[0], record[1]));
+                list.Add(new BagOfTerms(record[0] + " " + record[1], _lexer));
                 if (list.Count > size)
                 {
                     yield return list.ToArray();
