From 36a429b45b6a8873ec79a3418607caabe779f446 Mon Sep 17 00:00:00 2001 From: Ogoun Date: Wed, 22 Jan 2020 21:36:32 +0300 Subject: [PATCH] upd --- TFIDFbee/TFIDFbee.sln | 10 +++ TFIDFbee/TFIDFbee/Program.cs | 41 +-------- TFIDFbee/Tests/Program.cs | 57 ++++++++++++ TFIDFbee/Tests/Tests.csproj | 12 +++ .../Services/Semantic/Helpers/BagOfTerms.cs | 13 ++- .../Services/Semantic/Helpers/StopWords.cs | 6 ++ ZeroLevel/Services/Semantic/TFIDF.cs | 90 +++++++++++++++++++ ZeroLevel/Services/Utils/Multiprocessor.cs | 60 +++++++++++++ 8 files changed, 242 insertions(+), 47 deletions(-) create mode 100644 TFIDFbee/Tests/Program.cs create mode 100644 TFIDFbee/Tests/Tests.csproj create mode 100644 ZeroLevel/Services/Semantic/TFIDF.cs create mode 100644 ZeroLevel/Services/Utils/Multiprocessor.cs diff --git a/TFIDFbee/TFIDFbee.sln b/TFIDFbee/TFIDFbee.sln index c7467e9..23c1ec8 100644 --- a/TFIDFbee/TFIDFbee.sln +++ b/TFIDFbee/TFIDFbee.sln @@ -9,6 +9,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lemmatization", "..\Lemmati EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ZeroLevel", "..\ZeroLevel\ZeroLevel.csproj", "{6AF46F95-EA67-4258-96B1-7BBC57EB965D}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tests", "Tests\Tests.csproj", "{41061774-D2A1-4291-8909-62E4A63B03B4}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -41,6 +43,14 @@ Global {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|Any CPU.Build.0 = Release|Any CPU {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.ActiveCfg = Release|x64 {6AF46F95-EA67-4258-96B1-7BBC57EB965D}.Release|x64.Build.0 = Release|x64 + {41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|x64.ActiveCfg = Debug|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Debug|x64.Build.0 = Debug|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Release|Any CPU.Build.0 = Release|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Release|x64.ActiveCfg = Release|Any CPU + {41061774-D2A1-4291-8909-62E4A63B03B4}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/TFIDFbee/TFIDFbee/Program.cs b/TFIDFbee/TFIDFbee/Program.cs index 9939ef6..ef8e6be 100644 --- a/TFIDFbee/TFIDFbee/Program.cs +++ b/TFIDFbee/TFIDFbee/Program.cs @@ -6,48 +6,9 @@ using System.Linq; using System.Threading; using TFIDFbee.Reader; using ZeroLevel.Services.Semantic; -using ZeroLevel.Services.Semantic.Helpers; namespace TFIDFbee { - public class IDF - { - private ConcurrentDictionary _terms = - new ConcurrentDictionary(); - private long _documents_count = 0; - - public void Learn(BagOfTerms bag) - { - _documents_count++; - foreach (var term in bag.ToUniqueTokens()) - { - _terms.AddOrUpdate(term, 1, (w, o) => o + 1); - } - } - - public double Idf(string term) - { - if (_terms.ContainsKey(term)) - { - double count_documents_with_term = (double)_terms[term]; - double total_documents = (double)_documents_count; - return Math.Log(1.0d + (total_documents / count_documents_with_term)); - } - return 0.0d; - } - } - - public static class TFIDF - { - public static IDictionary TfIdf(BagOfTerms document, IDF idf) - { - var freg = document.Freguency(); - return document - .ToUniqueTokensWithoutStopWords() - 
.ToDictionary(t => t, t => idf.Idf(t) * (double)freg[t] / (double)document.Words.Length); - } - } - class Program { private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204_limit_1000.json"; @@ -62,7 +23,7 @@ namespace TFIDFbee { foreach (var doc in batch) { - idf.Learn(doc); + idf.Append(doc); } } foreach (var batch in reader.ReadBatches(1000)) diff --git a/TFIDFbee/Tests/Program.cs b/TFIDFbee/Tests/Program.cs new file mode 100644 index 0000000..fc54845 --- /dev/null +++ b/TFIDFbee/Tests/Program.cs @@ -0,0 +1,57 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Net.Http; +using ZeroLevel.Services.Web; + +namespace Tests +{ + class Program + { + public String responseToWords(String response) + { + response = response + .ToLowerInvariant() + .Replace("", ",") + .Replace("", ",") + .Replace("(); + foreach (String word in response.Split(",")) + { + if (!string.IsNullOrWhiteSpace(word) && word.Length > 1) + { + array.Add(word); + } + } + array.Sort(); + response = string.Join(' ', array); + return response; + } + + private HttpClient GetClient() + { + var handler = new HttpClientHandler + { + AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate, + UseDefaultCredentials = ZeroLevel.Configuration.Default.FirstOrDefault("useDefaultCredentianls"), + ServerCertificateCustomValidationCallback = (sender, cert, chain, sslPolicyErrors) => { return true; } + }; + if (ZeroLevel.Configuration.Default.FirstOrDefault("useDefaultCredentianls")) + { + handler.DefaultProxyCredentials = CredentialCache.DefaultCredentials; + } + var httpClient = new HttpClient(handler); + httpClient.DefaultRequestHeaders.Add("user-agent", UserAgents.Next()); + return httpClient; + } + + static void Main(string[] args) + { + Console.WriteLine("Hello World!"); + } + } +} diff --git a/TFIDFbee/Tests/Tests.csproj b/TFIDFbee/Tests/Tests.csproj new file mode 100644 index 0000000..f90f0c0 --- /dev/null +++ b/TFIDFbee/Tests/Tests.csproj @@ -0,0 +1,12 @@ + + + + Exe + netcoreapp3.1 + + + + + + + diff --git a/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs b/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs index 0e7cbc5..7b17026 100644 --- a/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs +++ b/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs @@ -1,5 +1,4 @@ -using Iveonik.Stemmers; -using System; +using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; @@ -15,21 +14,21 @@ namespace ZeroLevel.Services.Semantic.Helpers private string[] _words; private ILexProvider _lexer; - public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { } + public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), null) { } public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { } - public BagOfTerms(IEnumerable words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { } + public BagOfTerms(IEnumerable words) : this(words.ToArray(), null) { } public BagOfTerms(IEnumerable words, ILexProvider lexer) : this(words.ToArray(), lexer) { } - public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { } + public BagOfTerms(string[] words) : this(words, null) { } public BagOfTerms(string[] words, ILexProvider lexer) - { + { _lexer = lexer; _frequency = null; - _words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray(); + _words = _lexer == 
null ? words : _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray(); } public string[] Words => _words; diff --git a/ZeroLevel/Services/Semantic/Helpers/StopWords.cs b/ZeroLevel/Services/Semantic/Helpers/StopWords.cs index ea5a700..157edb5 100644 --- a/ZeroLevel/Services/Semantic/Helpers/StopWords.cs +++ b/ZeroLevel/Services/Semantic/Helpers/StopWords.cs @@ -5,10 +5,16 @@ namespace ZeroLevel.Implementation.Semantic.Helpers public static class StopWords { private readonly static HashSet _stop_words = new HashSet { "a", "about", "all", "am", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "can", "could", "do", "for", "from", "has", "have", "i", "if", "in", "is", "it", "me", "my", "no", "not", "of", "on", "one", "or", "so", "that", "the", "them", "there", "they", "this", "to", "was", "we", "what", "which", "will", "with", "would", "you", "а", "будем", "будет", "будете", "будешь", "буду", "будут", "будучи", "будь", "будьте", "бы", "был", "была", "были", "было", "быть", "в", "вам", "вами", "вас", "весь", "во", "вот", "все", "всё", "всего", "всей", "всем", "всём", "всеми", "всему", "всех", "всею", "всея", "всю", "вся", "вы", "да", "для", "до", "его", "едим", "едят", "ее", "её", "ей", "ел", "ела", "ем", "ему", "емъ", "если", "ест", "есть", "ешь", "еще", "ещё", "ею", "же", "за", "и", "из", "или", "им", "ими", "имъ", "их", "к", "как", "кем", "ко", "когда", "кого", "ком", "кому", "комья", "которая", "которого", "которое", "которой", "котором", "которому", "которою", "которую", "которые", "который", "которым", "которыми", "которых", "кто", "меня", "мне", "мной", "мною", "мог", "моги", "могите", "могла", "могли", "могло", "могу", "могут", "мое", "моё", "моего", "моей", "моем", "моём", "моему", "моею", "можем", "может", "можете", "можешь", "мои", "мой", "моим", "моими", "моих", "мочь", "мою", "моя", "мы", "на", "нам", "нами", "нас", "наса", "наш", "наша", "наше", "нашего", "нашей", "нашем", "нашему", "нашею", "наши", "нашим", "нашими", "наших", "нашу", "не", "него", "нее", "неё", "ней", "нем", "нём", "нему", "нет", "нею", "ним", "ними", "них", "но", "о", "об", "один", "одна", "одни", "одним", "одними", "одних", "одно", "одного", "одной", "одном", "одному", "одною", "одну", "он", "она", "оне", "они", "оно", "от", "по", "при", "с", "сам", "сама", "сами", "самим", "самими", "самих", "само", "самого", "самом", "самому", "саму", "свое", "своё", "своего", "своей", "своем", "своём", "своему", "своею", "свои", "свой", "своим", "своими", "своих", "свою", "своя", "себе", "себя", "собой", "собою", "та", "так", "такая", "такие", "таким", "такими", "таких", "такого", "такое", "такой", "таком", "такому", "такою", "такую", "те", "тебе", "тебя", "тем", "теми", "тех", "то", "тобой", "тобою", "того", "той", "только", "том", "томах", "тому", "тот", "тою", "ту", "ты", "у", "уже", "чего", "чем", "чём", "чему", "что", "чтобы", "эта", "эти", "этим", "этими", "этих", "это", "этого", "этой", "этом", "этому", "этот", "этою", "эту", "я", "ещë", "еë", "моë", "моëм", "всë", "кто-то ", "что-то", "мені", "наші", "нашої", "нашій", "нашою", "нашім", "ті", "тієї", "тією", "тії", "теє" }; + private readonly static HashSet _html_tags = new HashSet { "doctype", "a", "accesskey", "charset", "coords", "download", "href", "hreflang", "name", "rel", "rev", "shape", "tabindex", "target", "title", "type", "abbr", "title", "acronym", "address", "applet", "align", "alt", "archive", "code", "codebase", "height", "hspace", "vspace", "width", "area", "accesskey", "alt", "coords", "href", "hreflang", "nohref", "shape", 
"tabindex", "target", "type", "article", "aside", "audio", "autoplay", "controls", "loop", "muted", "preload", "src", "b", "base", "href", "target", "basefont", "color", "face", "size", "bdi", "bdo", "dir", "bgsound", "balance", "loop", "src", "volume", "big", "blink", "blockquote", "body", "alink", "background", "bgcolor", "bgproperties", "bottommargin", "leftmargin", "link", "rightmargin", "scroll", "text", "topmargin", "vlink", "br", "clear", "button", "accesskey", "autofocus", "disabled", "form", "formaction", "formenctype", "formmethod", "formnovalidate", "formtarget", "name", "type", "value", "canvas", "caption", "align", "valign", "center", "cite", "code", "col", "align", "char", "charoff", "span", "valign", "width", "colgroup", "align", "char", "charoff", "span", "valign", "width", "command", "comment", "datalist", "dd", "del", "cite", "datetime", "details", "dfn", "dir", "div", "align", "title", "dl", "dt", "em", "embed", "align", "height", "hidden", "hspace", "pluginspage", "src", "type", "vspace", "width", "fieldset", "disabled", "form", "title", "figcaption", "figure", "font", "color", "face", "size", "footer", "form", "accept-charset", "action", "autocomplete", "enctype", "method", "name", "novalidate", "target", "frame", "bordercolor", "frameborder", "name", "noresize", "scrolling", "src", "frameset", "border", "bordercolor", "cols", "frameborder", "framespacing", "rows", "h1", "align", "h2", "align", "h3", "align", "h4", "align", "h5", "align", "h6", "align", "head", "profile", "header", "hgroup", "hr", "align", "color", "noshade", "size", "width", "html", "manifest", "title", "xmlns", "i", "iframe", "align", "allowtransparency", "frameborder", "height", "hspace", "marginheight", "marginwidth", "name", "sandbox", "scrolling", "seamless", "src", "srcdoc", "vspace", "width", "img", "align", "alt", "border", "height", "hspace", "ismap", "longdesc", "lowsrc", "src", "usemap", "vspace", "width", "input", "accept", "accesskey", "align", "alt", "autocomplete", "autofocus", "border", "checked", "disabled", "form", "formaction", "formenctype", "formmethod", "formnovalidate", "formtarget", "list", "max", "maxlength", "min", "multiple", "name", "pattern", "placeholder", "readonly", "required", "size", "src", "step", "tabindex", "type", "value", "ins", "cite", "datetime", "isindex", "kbd", "keygen", "label", "accesskey", "for", "legend", "accesskey", "align", "title", "li", "type", "value", "link", "charset", "href", "media", "rel", "sizes", "type", "listing", "main", "map", "name", "mark", "marquee", "behavior", "bgcolor", "direction", "height", "hspace", "loop", "scrollamount", "scrolldelay", "truespeed", "vspace", "width", "menu", "label", "type", "meta", "charset", "content", "http-equiv", "name", "meter", "high", "low", "max", "min", "optimum", "value", "multicol", "nav", "nobr", "noembed", "noframes", "noscript", "object", "align", "archive", "classid", "code", "codebase", "codetype", "data", "height", "hspace", "tabindex", "type", "vspace", "width", "ol", "reversed", "start", "type", "optgroup", "disabled", "label", "option", "disabled", "label", "selected", "value", "output", "p", "align", "param", "name", "type", "value", "valuetype", "plaintext", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script", "async", "defer", "language", "src", "type", "section", "select", "accesskey", "autofocus", "disabled", "form", "multiple", "name", "required", "size", "tabindex", "small", "source", "media", "src", "type", "spacer", "span", "strike", "strong", "style", "media", 
"type", "sub", "summary", "sup", "table", "align", "background", "bgcolor", "border", "bordercolor", "cellpadding", "cellspacing", "cols", "frame", "height", "rules", "summary", "width", "tbody", "align", "bgcolor", "char", "charoff", "valign", "td", "abbr", "align", "axis", "background", "bgcolor", "bordercolor", "char", "charoff", "colspan", "headers", "height", "nowrap", "rowspan", "scope", "valign", "width", "textarea", "accesskey", "autofocus", "cols", "disabled", "form", "maxlength", "name", "placeholder", "readonly", "required", "rows", "tabindex", "wrap", "tfoot", "align", "bgcolor", "char", "charoff", "valign", "th", "abbr", "align", "axis", "background", "bgcolor", "bordercolor", "char", "charoff", "colspan", "headers", "height", "nowrap", "rowspan", "scope", "valign", "width", "thead", "align", "bgcolor", "char", "charoff", "valign", "time", "datetime", "pubdate", "title", "tr", "align", "bgcolor", "bordercolor", "char", "charoff", "valign", "track", "tt", "u", "ul", "type", "var", "video", "autoplay", "controls", "height", "loop", "poster", "preload", "src", "width", "wbr", "xmp" }; public static bool IsStopWord(string word) { return _stop_words.Contains(word.Trim().ToLowerInvariant()); } + + public static bool IsHtmlTag(string word) + { + return _html_tags.Contains(word.Trim().ToLowerInvariant()); + } } } \ No newline at end of file diff --git a/ZeroLevel/Services/Semantic/TFIDF.cs b/ZeroLevel/Services/Semantic/TFIDF.cs new file mode 100644 index 0000000..5ef98e1 --- /dev/null +++ b/ZeroLevel/Services/Semantic/TFIDF.cs @@ -0,0 +1,90 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using ZeroLevel.Services.Semantic.Helpers; + +namespace ZeroLevel.Services.Semantic +{ + public class IDF + { + private ConcurrentDictionary _terms = + new ConcurrentDictionary(); + private long _documents_count = 0; + + public void Append(BagOfTerms bag) + { + _documents_count++; + foreach (var term in bag.ToUniqueTokens()) + { + _terms.AddOrUpdate(term, 1, (w, o) => o + 1); + } + } + + public double Idf(string term) + { + if (_terms.ContainsKey(term)) + { + double count_documents_with_term = (double)_terms[term]; + double total_documents = (double)_documents_count; + return Math.Log(1.0d + (total_documents / count_documents_with_term)); + } + return 0.0d; + } + } + + public static class TFIDF + { + private static readonly IReadOnlyDictionary _empty = new Dictionary(); + + public static IReadOnlyDictionary TfIdf(BagOfTerms document, IDF idf) + { + if (document.Words.Length > 0) + { + var freg = document.Freguency(); + return document + .ToUniqueTokensWithoutStopWords() + .ToDictionary(t => t, t => idf.Idf(t) * (double)freg[t] / (double)document.Words.Length); + } + return _empty; + } + + public static IReadOnlyDictionary TfIdf_Smooth(BagOfTerms document, IDF idf) + { + if (document.Words.Length > 0) + { + var freg = document.Freguency(); + var max = (double)freg.Max(f => f.Value); + return document + .ToUniqueTokensWithoutStopWords() + .ToDictionary(t => t, t => idf.Idf(t) * (0.5d + 0.5d * ((double)freg[t] / max))); + } + return _empty; + } + + public static IReadOnlyDictionary Tf(BagOfTerms document) + { + if (document.Words.Length > 0) + { + var freg = document.Freguency(); + return document + .ToUniqueTokensWithoutStopWords() + .ToDictionary(t => t, t => (double)freg[t] / (double)document.Words.Length); + } + return _empty; + } + + public static IReadOnlyDictionary Tf_Smooth(BagOfTerms document) + { + if (document.Words.Length > 0) + 
{
+                var freg = document.Freguency();
+                var max = (double)freg.Max(f => f.Value);
+                return document
+                    .ToUniqueTokensWithoutStopWords()
+                    .ToDictionary(t => t, t => (0.5d + 0.5d * ((double)freg[t] / max)));
+            }
+            return _empty;
+        }
+    }
+}
diff --git a/ZeroLevel/Services/Utils/Multiprocessor.cs b/ZeroLevel/Services/Utils/Multiprocessor.cs
new file mode 100644
index 0000000..fa2f2ec
--- /dev/null
+++ b/ZeroLevel/Services/Utils/Multiprocessor.cs
@@ -0,0 +1,60 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Threading;
+
+namespace ZeroLevel.Utils
+{
+    public class Multiprocessor<T>
+        : IDisposable
+    {
+        private BlockingCollection<T> _queue = new BlockingCollection<T>();
+        private List<Thread> _threads = new List<Thread>();
+
+        public Multiprocessor(Action<T> handler, int size, int stackSize = 1024 * 256)
+        {
+            for (int i = 0; i < size; i++)
+            {
+                var t = new Thread(() =>
+                {
+                    try
+                    {
+                        T item;
+                        while (!_queue.IsCompleted)
+                        {
+                            if (_queue.TryTake(out item, 200))
+                            {
+                                handler(item);
+                            }
+                        }
+                    }
+                    catch { }
+                }, stackSize);
+                t.IsBackground = true;
+                _threads.Add(t);
+            }
+            foreach (var t in _threads) t.Start();
+        }
+
+        public void Append(T t) => _queue.Add(t);
+
+        public void WaitForEmpty()
+        {
+            while (_queue.Count > 0)
+            {
+                Thread.Sleep(100);
+            }
+        }
+
+        public void Dispose()
+        {
+            _queue.CompleteAdding();
+            while (_queue.Count > 0)
+            {
+                Thread.Sleep(100);
+            }
+            Thread.Sleep(3000); // wait while threads exit
+            _queue.Dispose();
+        }
+    }
+}
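
Usage sketch (illustrative, not part of the diff): a minimal example of the IDF/TFIDF pair this commit moves into ZeroLevel.Services.Semantic, together with BagOfTerms, which after this patch tokenizes without a stemmer when no ILexProvider is supplied. The sample corpus, the TfIdfSketch class name and the Main method are assumptions for the sketch; IDF.Append, TFIDF.TfIdf and BagOfTerms come from the patch itself.

using System;
using ZeroLevel.Services.Semantic;          // IDF, TFIDF (ZeroLevel/Services/Semantic/TFIDF.cs)
using ZeroLevel.Services.Semantic.Helpers;  // BagOfTerms

static class TfIdfSketch
{
    static void Main()
    {
        // Hypothetical in-memory corpus; in TFIDFbee the documents come from reader.ReadBatches(1000).
        var corpus = new[]
        {
            "the quick brown fox jumps over the lazy dog",
            "the lazy dog sleeps all day",
            "a quick fox is a clever fox"
        };

        // First pass: collect document frequencies, the same pattern as idf.Append(doc) in Program.cs.
        var idf = new IDF();
        foreach (var text in corpus)
        {
            idf.Append(new BagOfTerms(text));
        }

        // Second pass: per-document TF*IDF weights; TfIdf_Smooth, Tf and Tf_Smooth follow the same shape.
        foreach (var text in corpus)
        {
            Console.WriteLine(text);
            var weights = TFIDF.TfIdf(new BagOfTerms(text), idf);
            foreach (var pair in weights)
            {
                Console.WriteLine($"  {pair.Key}: {pair.Value:F4}");
            }
        }
    }
}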
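
Usage sketch (illustrative, not part of the diff): how the new Multiprocessor<T> might be used to fan work out to a fixed pool of background threads. The MultiprocessorSketch class, the printing handler, the pool size of 4 and the 100 queued items are assumptions; Append, WaitForEmpty and Dispose are the API added by the patch. Note that WaitForEmpty only waits for the queue to drain, and Dispose completes the queue and then gives the workers a short grace period to exit.

using System;
using ZeroLevel.Utils; // Multiprocessor<T> (ZeroLevel/Services/Utils/Multiprocessor.cs)

static class MultiprocessorSketch
{
    static void Main()
    {
        // Four background threads consume a shared BlockingCollection<int> behind the scenes.
        using (var workers = new Multiprocessor<int>(n => Console.WriteLine($"processed {n}"), 4))
        {
            for (int i = 0; i < 100; i++)
            {
                workers.Append(i);
            }
            workers.WaitForEmpty(); // queue drained; in-flight handlers may still be running
        } // Dispose(): CompleteAdding, drain, then a short pause before the worker threads stop
    }
}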