From ed1983b71537b84d324c0edac215ea300a839970 Mon Sep 17 00:00:00 2001 From: Ogoun Date: Fri, 17 Jan 2020 18:26:08 +0300 Subject: [PATCH] upds --- TFIDFbee/TFIDFbee.sln | 51 +++ TFIDFbee/TFIDFbee/Document.cs | 8 + TFIDFbee/TFIDFbee/Program.cs | 367 ++++++++++++++++++ TFIDFbee/TFIDFbee/TFIDFbee.csproj | 19 + TestApp/Program.cs | 21 +- TestApp/TestApp.csproj | 2 +- ZeroLevel/Services/Config/Configuration.cs | 4 + ZeroLevel/Services/FileSystem/FSUtils.cs | 2 +- .../Services/Semantic/Helpers/BagOfWords.cs | 96 +++++ .../Services/Semantic/Helpers/SparceVector.cs | 101 +++++ 10 files changed, 649 insertions(+), 22 deletions(-) create mode 100644 TFIDFbee/TFIDFbee.sln create mode 100644 TFIDFbee/TFIDFbee/Document.cs create mode 100644 TFIDFbee/TFIDFbee/Program.cs create mode 100644 TFIDFbee/TFIDFbee/TFIDFbee.csproj create mode 100644 ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs create mode 100644 ZeroLevel/Services/Semantic/Helpers/SparceVector.cs diff --git a/TFIDFbee/TFIDFbee.sln b/TFIDFbee/TFIDFbee.sln new file mode 100644 index 0000000..2eca224 --- /dev/null +++ b/TFIDFbee/TFIDFbee.sln @@ -0,0 +1,51 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29709.97 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TFIDFbee", "TFIDFbee\TFIDFbee.csproj", "{7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Lemmatization", "..\HoKeMs\Lemmatization\Lemmatization.csproj", "{BF9F7C1E-098B-4815-BA35-8A9845C66663}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ZeroLevel", "..\..\GIT\Zero\ZeroLevel\ZeroLevel.csproj", "{5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|x64.ActiveCfg = Debug|x64 + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Debug|x64.Build.0 = Debug|x64 + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|Any CPU.Build.0 = Release|Any CPU + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|x64.ActiveCfg = Release|Any CPU + {7B39E0A1-3DE4-4702-8D61-5C9A6CF164C6}.Release|x64.Build.0 = Release|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|x64.ActiveCfg = Debug|x64 + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Debug|x64.Build.0 = Debug|x64 + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|Any CPU.Build.0 = Release|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|x64.ActiveCfg = Release|Any CPU + {BF9F7C1E-098B-4815-BA35-8A9845C66663}.Release|x64.Build.0 = Release|Any CPU + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|x64.ActiveCfg = Debug|x64 + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Debug|x64.Build.0 = Debug|x64 + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|Any CPU.Build.0 = Release|Any CPU + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|x64.ActiveCfg = Release|x64 + {5FF0C954-7FB8-49F4-9E97-9DCC933D45FF}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {F440F94E-537A-44F4-9103-BD6C7BCAF6E3} + EndGlobalSection +EndGlobal diff --git a/TFIDFbee/TFIDFbee/Document.cs b/TFIDFbee/TFIDFbee/Document.cs new file mode 100644 index 0000000..a1183a9 --- /dev/null +++ b/TFIDFbee/TFIDFbee/Document.cs @@ -0,0 +1,8 @@ +namespace TFIDFbee +{ + public class Document + { + public string Title { get; set; } + public string Text { get; set; } + } +} diff --git a/TFIDFbee/TFIDFbee/Program.cs b/TFIDFbee/TFIDFbee/Program.cs new file mode 100644 index 0000000..61becb0 --- /dev/null +++ b/TFIDFbee/TFIDFbee/Program.cs @@ -0,0 +1,367 @@ +using Accord.MachineLearning; +using Lemmatization; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using ZeroLevel; +using ZeroLevel.Services.Semantic; +using ZeroLevel.Services.Semantic.Helpers; + +namespace TFIDFbee +{ + class Program + { + private const string source = @"D:\Desktop\lenta-ru-data-set_19990901_20171204.json"; + private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer()); + + static void Main(string[] args) + { + Configuration.Save(Configuration.ReadFromApplicationConfig()); + /*var codebook = new TFIDF() + { + Tf = TermFrequency.Log, + Idf = InverseDocumentFrequency.Default, + UpdateDictionary = true + };*/ + var codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfWords(); + foreach (var batch in ParseBatches(1000)) + { + codebook.Learn(batch); + Console.WriteLine($"Documents: {codebook.NumberOfDocuments}"); + Console.WriteLine($"Words: {codebook.NumberOfWords}"); + } + + var vectors = new List(); + foreach (var docs in ReadRawDocumentBatches(1000)) + { + foreach (var doc in docs) + { + var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token)/*.Concat(_lexer.ExtractLexTokens(doc.Text).Select(t => t.Token))*/.ToArray(); + vectors.Add(codebook.Transform(words)); + } + } + + var list = new List>(); + for (int i = 0; i < vectors.Count; i++) + { + for (int j = i + 1; j < vectors.Count - 1; j++) + { + if (i == j) continue; + var diff = vectors[i].Measure(vectors[j]); + if (diff > double.Epsilon) + { + list.Add(Tuple.Create(diff, i, j)); + } + } + } + + var to_present = list.OrderBy(e => e.Item1).Take(200).ToArray(); + var to_present_map = new Dictionary>(); + foreach (var e in to_present) + { + if (!to_present_map.ContainsKey(e.Item2)) + { + to_present_map.Add(e.Item2, null); + } + if (!to_present_map.ContainsKey(e.Item3)) + { + to_present_map.Add(e.Item3, null); + } + } + + int index = 0; + foreach (var docs in ReadRawDocumentBatches(1000)) + { + foreach (var doc in docs) + { + if (to_present_map.ContainsKey(index)) + { + to_present_map[index] = doc; + } + index++; + } + } + + index = 0; + foreach (var e in to_present) + { + Console.WriteLine($"#{index++}: {e.Item1}"); + Console.WriteLine(to_present_map[e.Item2].Item1); + Console.WriteLine(to_present_map[e.Item3].Item2); + Console.WriteLine("--------------------"); + Console.WriteLine(); + } + + Console.WriteLine("Completed"); + Console.ReadKey(); + } + + private static IEnumerable ExtractLemmas(string text) + { + return + _lexer.ExtractUniqueLexTokensWithoutStopWords(text) + .Select(t => t.Token) + .Where(s => s.Any(c => char.IsLetter(c))); + } + + public static IEnumerable ReadBatches(int size) + { + var list = new List(); + foreach (var batch in ReadDocumentBatches(size)) + { + yield return batch.ToArray(); + list.Clear(); + } + } + + public static IEnumerable> ReadDocumentBatches(int size) + { + string line; + var batch = new List(); + string title = null; + string text = null; + using (StreamReader reader = new StreamReader(source)) + { + while ((line = reader.ReadLine()) != null) + { + var titleIndex = line.IndexOf("\"metaTitle\":"); + if (titleIndex >= 0) + { + var start = line.IndexOf("\"", titleIndex + 12); + var end = line.LastIndexOf("\""); + if (start < end && start != -1 && end != -1) + { + title = line.Substring(start + 1, end - start - 1); + } + } + else + { + var textIndex = line.IndexOf("\"plaintext\":"); + if (textIndex >= 0 && title != null) + { + var start = line.IndexOf("\"", textIndex + 12); + var end = line.LastIndexOf("\""); + if (start < end && start != -1 && end != -1) + { + text = line.Substring(start + 1, end - start - 1); + batch.Add(ExtractLemmas(title).Concat(ExtractLemmas(text)).ToArray()); + if (batch.Count >= size) + { + yield return batch; + batch.Clear(); + GC.Collect(2); + } + title = null; + text = null; + } + } + } + } + } + if (batch.Count > 0) + { + yield return batch; + } + } + + public static IEnumerable>> ReadRawDocumentBatches(int size) + { + string line; + var batch = new List>(); + string title = null; + string text = null; + using (StreamReader reader = new StreamReader(source)) + { + while ((line = reader.ReadLine()) != null) + { + var titleIndex = line.IndexOf("\"metaTitle\":"); + if (titleIndex >= 0) + { + var start = line.IndexOf("\"", titleIndex + 12); + var end = line.LastIndexOf("\""); + if (start < end && start != -1 && end != -1) + { + title = line.Substring(start + 1, end - start - 1); + } + } + else + { + var textIndex = line.IndexOf("\"plaintext\":"); + if (textIndex >= 0 && title != null) + { + var start = line.IndexOf("\"", textIndex + 12); + var end = line.LastIndexOf("\""); + if (start < end && start != -1 && end != -1) + { + text = line.Substring(start + 1, end - start - 1); + batch.Add(Tuple.Create(title, text)); + if (batch.Count >= size) + { + yield return batch; + batch.Clear(); + GC.Collect(2); + } + title = null; + text = null; + } + } + } + } + } + if (batch.Count > 0) + { + yield return batch; + } + } + + private class RecordParser + { + private enum RPState + { + WaitKey, + ParseKey, + WaitKeyConfirm, + WaitValue, + ParseValue + } + private readonly StringBuilder _builder = new StringBuilder(); + private RPState State = RPState.WaitKey; + private char _previous = '\0'; + private string _key; + private string _value; + private readonly Action _callback; + + public RecordParser(Action callback) + { + _callback = callback; + } + + public void Append(string text) + { + foreach (var ch in text) + { + switch (State) + { + case RPState.WaitKey: + if (ch.Equals('"')) + { + State = RPState.ParseKey; + _builder.Clear(); + } + break; + case RPState.ParseKey: + if (ch.Equals('"') && _previous != '\\') + { + if (_builder.Length > 0) + { + State = RPState.WaitKeyConfirm; + } + else + { + State = RPState.WaitKey; + } + } + else + { + _builder.Append(ch); + } + break; + case RPState.WaitKeyConfirm: + if (ch.Equals(':')) + { + _key = _builder.ToString(); + State = RPState.WaitValue; + } + else if (ch == ' ' || ch == '\r' || ch == '\n') + { + // nothing + } + else + { + State = RPState.WaitKey; + } + break; + case RPState.WaitValue: + if (ch.Equals('"')) + { + State = RPState.ParseValue; + _builder.Clear(); + } + else if (ch == ' ' || ch == '\r' || ch == '\n') + { + // nothing + } + else + { + State = RPState.WaitKey; + } + break; + case RPState.ParseValue: + if (ch.Equals('"') && _previous != '\\') + { + if (_builder.Length > 0) + { + _value = _builder.ToString(); + _callback(_key, _value); + } + State = RPState.WaitKey; + } + else + { + _builder.Append(ch); + } + break; + } + _previous = ch; + } + } + } + + public static IEnumerable ParseBatches(int size) + { + var list = new List(); + foreach (var record in Parse()) + { + list.Add(record); + if (list.Count > size) + { + yield return list.ToArray(); + list.Clear(); + } + } + if (list.Count > 0) + { + yield return list.ToArray(); + } + } + + public static IEnumerable Parse() + { + var result = new string[2]; + var parser = new RecordParser((k, v) => + { + switch (k) + { + case "metaTitle": result[0] = v; break; + case "plaintext": result[1] = v; break; + } + }); + char[] buffer = new char[16536]; + int count = 0; + using (StreamReader reader = new StreamReader(source)) + { + count = reader.Read(buffer, 0, buffer.Length); + parser.Append(new string(buffer, 0, count)); + + if (!string.IsNullOrEmpty(result[0]) && !string.IsNullOrEmpty(result[1])) + { + yield return result; + result[0] = null; + result[1] = null; + } + } + } + } +} diff --git a/TFIDFbee/TFIDFbee/TFIDFbee.csproj b/TFIDFbee/TFIDFbee/TFIDFbee.csproj new file mode 100644 index 0000000..dc03126 --- /dev/null +++ b/TFIDFbee/TFIDFbee/TFIDFbee.csproj @@ -0,0 +1,19 @@ + + + + Exe + netcoreapp3.1 + AnyCPU;x64 + + + + + + + + + + + + + diff --git a/TestApp/Program.cs b/TestApp/Program.cs index f372b67..aa0d387 100644 --- a/TestApp/Program.cs +++ b/TestApp/Program.cs @@ -1,8 +1,8 @@ using Newtonsoft.Json; using System; +using System.IO; using ZeroLevel; using ZeroLevel.Logging; -using ZeroLevel.Services.Web; namespace TestApp { @@ -22,25 +22,6 @@ namespace TestApp private static void Main(string[] args) { - /*var fiber = new Fiber(); - fiber - .Add((s) => { Console.WriteLine("1"); s.Add("1", 1); return s; }) - .Add((s) => { Console.WriteLine("2"); s.Add("2", 2); return s; }) - .Add((s) => { Console.WriteLine("3"); s.Add("3", 3); return s; }) - .Add((s) => { Console.WriteLine("4"); s.Add("4", 4); return s; }) - .Add((s) => { Console.WriteLine("5"); s.Add("5", 5); return s; }); - - var result = fiber.Run(); - Console.WriteLine(); - Console.WriteLine("Result"); - foreach (var key in result.Keys()) - { - Console.WriteLine($"{key}: {result.Get(key)}"); - }*/ - - - - Configuration.Save(Configuration.ReadFromApplicationConfig()); Bootstrap.Startup(args, () => Configuration.ReadSetFromIniFile("config.ini")) diff --git a/TestApp/TestApp.csproj b/TestApp/TestApp.csproj index 9f8c521..3220245 100644 --- a/TestApp/TestApp.csproj +++ b/TestApp/TestApp.csproj @@ -2,7 +2,7 @@ Exe - netcoreapp2.2 + netcoreapp3.0 diff --git a/ZeroLevel/Services/Config/Configuration.cs b/ZeroLevel/Services/Config/Configuration.cs index efb4e54..0f10be4 100644 --- a/ZeroLevel/Services/Config/Configuration.cs +++ b/ZeroLevel/Services/Config/Configuration.cs @@ -36,6 +36,10 @@ namespace ZeroLevel BaseDirectory = Path.GetDirectoryName(assembly.Location); AppLocation = assembly.Location; } + else + { + BaseDirectory = Directory.GetCurrentDirectory(); + } } #endregion Ctor diff --git a/ZeroLevel/Services/FileSystem/FSUtils.cs b/ZeroLevel/Services/FileSystem/FSUtils.cs index 451c570..da1b9cf 100644 --- a/ZeroLevel/Services/FileSystem/FSUtils.cs +++ b/ZeroLevel/Services/FileSystem/FSUtils.cs @@ -306,7 +306,7 @@ namespace ZeroLevel.Services.FileSystem } public static bool IsDirectoryEmpty(string path) - { + { return !Directory.EnumerateFileSystemEntries(path).Any(); } } diff --git a/ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs b/ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs new file mode 100644 index 0000000..4d1ac3c --- /dev/null +++ b/ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs @@ -0,0 +1,96 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.Services.Semantic.Helpers +{ + public class BagOfWords : + IBinarySerializable + { + private readonly ConcurrentDictionary _words = + new ConcurrentDictionary(); + int _words_count = -1; + long _number_of_documents = 0; + + public long NumberOfDocuments => _number_of_documents; + public int NumberOfWords => _words.Count; + + /// + /// Набор документов, слова в документе должны быть лемматизированы/стеммированы, и быть уникальными + /// + /// + public void Learn(string[][] documents) + { + Parallel.ForEach(documents, doc => + { + Interlocked.Increment(ref _number_of_documents); + var partition = new Dictionary(); + foreach (var word in doc) + { + if (!_words.ContainsKey(word)) + { + if (false == _words.TryAdd(word, new int[2] { Interlocked.Increment(ref _words_count), 1 })) + { + Interlocked.Increment(ref _words[word][1]); + } + } + else + { + Interlocked.Increment(ref _words[word][1]); + } + } + }); + } + + /// + /// + /// + /// Документ - слова в котором должны быть лемматизированы/стеммированы, так же как в модели + /// + public SparceVector Transform(string[] doc) + { + if (doc == null || doc.Length == 0) return new SparceVector(); + var map = new Dictionary(); + foreach (var word in doc) + { + if (map.ContainsKey(word)) + { + map[word]++; + } + else + { + map[word] = 1; + } + } + var result = new Dictionary(); + foreach (var word in doc) + { + if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0])) + { + var tf = (double)map[word] / (double)doc.Length; + var idf = Math.Log(_number_of_documents / _words[word][1]); + var tfidf = tf * idf; + if (Math.Abs(tfidf) > double.Epsilon) + { + result.Add(_words[word][0], tfidf); + } + } + } + return new SparceVector(result.Values.ToArray(), result.Keys.ToArray()); + } + + public void Deserialize(IBinaryReader reader) + { + throw new NotImplementedException(); + } + + public void Serialize(IBinaryWriter writer) + { + throw new NotImplementedException(); + } + } +} diff --git a/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs b/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs new file mode 100644 index 0000000..c093f25 --- /dev/null +++ b/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs @@ -0,0 +1,101 @@ +using System; +using System.Collections.Generic; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.Services.Semantic.Helpers +{ + public sealed class SparceVector + : IBinarySerializable + { + private readonly static int[] EmptyIndexes = new int[0]; + private readonly static double[] EmptyValues = new double[0]; + + private int[] indexes; + private double[] values; + private double power; + + public SparceVector() + { + indexes = EmptyIndexes; + values = EmptyValues; + power = 0; + } + + public SparceVector(double[] vector) + { + var l = new List(); + for (int i = 0; i < vector.Length; i++) + { + if (Math.Abs(vector[i]) > double.Epsilon) + { + l.Add(i); + } + } + indexes = l.ToArray(); + values = new double[l.Count]; + power = 0; + for (int i = 0; i < l.Count; i++) + { + values[i] = vector[indexes[i]]; + power += values[i] * values[i]; + } + power = Math.Sqrt(power); + } + + public SparceVector(double[] vector, int[] indicies) + { + indexes = indicies; + values = vector; + power = 0; + for (int i = 0; i < indexes.Length; i++) + { + power += values[i] * values[i]; + } + power = Math.Sqrt(power); + } + + public double Measure(SparceVector other) + { + double sum = 0.0d; + + int li = 0, ri = 0; + int lv, rv; + + while (li < this.indexes.Length && + ri < other.indexes.Length) + { + lv = this.indexes[li]; + rv = other.indexes[ri]; + if (lv == rv) + { + // у обоих векторов совпадение по индексам + sum += this.values[li] * other.values[ri]; + li++; ri++; + } + else if (lv < rv) + { + li++; + } + else + { + ri++; + } + } + return sum / (this.power * other.power); + } + + public void Serialize(IBinaryWriter writer) + { + writer.WriteDouble(this.power); + writer.WriteCollection(indexes); + writer.WriteCollection(values); + } + + public void Deserialize(IBinaryReader reader) + { + this.power = reader.ReadDouble(); + this.indexes = reader.ReadInt32Collection().ToArray(); + this.values = reader.ReadDoubleCollection().ToArray(); + } + } +}