From cc54d87d870c82dda0a54c22711e52e1d1a97c2b Mon Sep 17 00:00:00 2001 From: Ogoun Date: Mon, 20 Jan 2020 21:33:13 +0300 Subject: [PATCH] fix keyword extraction --- TFIDFbee/TFIDFbee/Program.cs | 72 ++++++++++++++++--- TFIDFbee/TFIDFbee/Reader/IDocumentReader.cs | 7 +- TFIDFbee/TFIDFbee/Reader/JsonByLineReader.cs | 72 ++----------------- .../TFIDFbee/Reader/StateMachineReader.cs | 34 +++------ 4 files changed, 79 insertions(+), 106 deletions(-) diff --git a/TFIDFbee/TFIDFbee/Program.cs b/TFIDFbee/TFIDFbee/Program.cs index a140eed..9939ef6 100644 --- a/TFIDFbee/TFIDFbee/Program.cs +++ b/TFIDFbee/TFIDFbee/Program.cs @@ -2,32 +2,82 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; -using System.IO; using System.Linq; +using System.Threading; using TFIDFbee.Reader; -using ZeroLevel; using ZeroLevel.Services.Semantic; using ZeroLevel.Services.Semantic.Helpers; -using ZeroLevel.Services.Serialization; namespace TFIDFbee { + public class IDF + { + private ConcurrentDictionary _terms = + new ConcurrentDictionary(); + private long _documents_count = 0; + + public void Learn(BagOfTerms bag) + { + _documents_count++; + foreach (var term in bag.ToUniqueTokens()) + { + _terms.AddOrUpdate(term, 1, (w, o) => o + 1); + } + } + + public double Idf(string term) + { + if (_terms.ContainsKey(term)) + { + double count_documents_with_term = (double)_terms[term]; + double total_documents = (double)_documents_count; + return Math.Log(1.0d + (total_documents / count_documents_with_term)); + } + return 0.0d; + } + } + + public static class TFIDF + { + public static IDictionary TfIdf(BagOfTerms document, IDF idf) + { + var freg = document.Freguency(); + return document + .ToUniqueTokensWithoutStopWords() + .ToDictionary(t => t, t => idf.Idf(t) * (double)freg[t] / (double)document.Words.Length); + } + } + class Program { - private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json"; + private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json"; private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer()); private readonly static ConcurrentDictionary _scoring = new ConcurrentDictionary(); static void Main(string[] args) { - var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer); - - Console.WriteLine(string.Join('-', terms.ToTokens())); - Console.WriteLine(string.Join('-', terms.ToUniqueTokens())); - Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords())); - Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}"))); + IDF idf = new IDF(); + IDocumentReader reader = new JsonByLineReader(source, _lexer); + foreach (var batch in reader.ReadBatches(1000)) + { + foreach (var doc in batch) + { + idf.Learn(doc); + } + } + foreach (var batch in reader.ReadBatches(1000)) + { + foreach (var doc in batch) + { + var tfidf = TFIDF.TfIdf(doc, idf); + Console.WriteLine(String.Join(" ", tfidf.OrderByDescending(p => p.Value).Take(10).Select(p => p.Key))); + Console.WriteLine(); + Console.WriteLine(" ***"); + Console.WriteLine(); + Thread.Sleep(1000); + } + } - /* Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug); diff --git a/TFIDFbee/TFIDFbee/Reader/IDocumentReader.cs b/TFIDFbee/TFIDFbee/Reader/IDocumentReader.cs index cb8f587..1ea2a21 100644 --- a/TFIDFbee/TFIDFbee/Reader/IDocumentReader.cs +++ b/TFIDFbee/TFIDFbee/Reader/IDocumentReader.cs @@ -1,11 +1,10 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; +using ZeroLevel.Services.Semantic.Helpers; namespace TFIDFbee.Reader { public interface IDocumentReader { - IEnumerable ReadBatches(int size); - public IEnumerable>> ReadRawDocumentBatches(int size); + IEnumerable> ReadBatches(int size); } } diff --git a/TFIDFbee/TFIDFbee/Reader/JsonByLineReader.cs b/TFIDFbee/TFIDFbee/Reader/JsonByLineReader.cs index de89910..aab8d70 100644 --- a/TFIDFbee/TFIDFbee/Reader/JsonByLineReader.cs +++ b/TFIDFbee/TFIDFbee/Reader/JsonByLineReader.cs @@ -2,6 +2,8 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using ZeroLevel.Services.Semantic; +using ZeroLevel.Services.Semantic.Helpers; namespace TFIDFbee.Reader { @@ -9,78 +11,18 @@ namespace TFIDFbee.Reader : IDocumentReader { private readonly string _file; - private readonly Func> _lexer; + private readonly ILexProvider _lexer; - public JsonByLineReader(string file, Func> lexer) + public JsonByLineReader(string file, ILexProvider lexer) { _file = file; _lexer = lexer; } - public IEnumerable ReadBatches(int size) - { - var list = new List(); - foreach (var batch in ReadDocumentBatches(size)) - { - yield return batch.ToArray(); - list.Clear(); - } - } - - private IEnumerable> ReadDocumentBatches(int size) - { - string line; - var batch = new List(); - string title = null; - string text = null; - using (StreamReader reader = new StreamReader(_file)) - { - while ((line = reader.ReadLine()) != null) - { - var titleIndex = line.IndexOf("\"metaTitle\":"); - if (titleIndex >= 0) - { - var start = line.IndexOf("\"", titleIndex + 12); - var end = line.LastIndexOf("\""); - if (start < end && start != -1 && end != -1) - { - title = line.Substring(start + 1, end - start - 1); - } - } - else - { - var textIndex = line.IndexOf("\"plaintext\":"); - if (textIndex >= 0 && title != null) - { - var start = line.IndexOf("\"", textIndex + 12); - var end = line.LastIndexOf("\""); - if (start < end && start != -1 && end != -1) - { - text = line.Substring(start + 1, end - start - 1); - batch.Add(_lexer(title).Concat(_lexer(text)).ToArray()); - if (batch.Count >= size) - { - yield return batch; - batch.Clear(); - GC.Collect(2); - } - title = null; - text = null; - } - } - } - } - } - if (batch.Count > 0) - { - yield return batch; - } - } - - public IEnumerable>> ReadRawDocumentBatches(int size) + public IEnumerable> ReadBatches(int size) { string line; - var batch = new List>(); + var batch = new List(); string title = null; string text = null; using (StreamReader reader = new StreamReader(_file)) @@ -107,7 +49,7 @@ namespace TFIDFbee.Reader if (start < end && start != -1 && end != -1) { text = line.Substring(start + 1, end - start - 1); - batch.Add(Tuple.Create(title, text)); + batch.Add(new BagOfTerms(title + " " + text, _lexer)); if (batch.Count >= size) { yield return batch; diff --git a/TFIDFbee/TFIDFbee/Reader/StateMachineReader.cs b/TFIDFbee/TFIDFbee/Reader/StateMachineReader.cs index c7385ed..8916a87 100644 --- a/TFIDFbee/TFIDFbee/Reader/StateMachineReader.cs +++ b/TFIDFbee/TFIDFbee/Reader/StateMachineReader.cs @@ -1,7 +1,7 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.IO; -using System.Linq; +using ZeroLevel.Services.Semantic; +using ZeroLevel.Services.Semantic.Helpers; namespace TFIDFbee.Reader { @@ -9,9 +9,9 @@ namespace TFIDFbee.Reader : IDocumentReader { private readonly string _file; - private readonly Func> _lexer; + private readonly ILexProvider _lexer; - public StateMachineReader(string file, Func> lexer) + public StateMachineReader(string file, ILexProvider lexer) { _file = file; _lexer = lexer; @@ -46,30 +46,12 @@ namespace TFIDFbee.Reader } } - public IEnumerable ReadBatches(int size) + public IEnumerable> ReadBatches(int size) { - var list = new List(); + var list = new List(); foreach (var record in Parse()) { - list.Add((_lexer(record[0]).Concat(_lexer(record[1])).ToArray())); - if (list.Count > size) - { - yield return list.ToArray(); - list.Clear(); - } - } - if (list.Count > 0) - { - yield return list.ToArray(); - } - } - - public IEnumerable>> ReadRawDocumentBatches(int size) - { - var list = new List>(); - foreach (var record in Parse()) - { - list.Add(Tuple.Create(record[0], record[1])); + list.Add(new BagOfTerms(record[0] + " " + record[1], _lexer)); if (list.Count > size) { yield return list.ToArray();