Upd

6 years ago · bcb30bc693
parent c9b9eadad6
commit bcb30bc693
6 changed files with 165 additions and 36 deletions
--- a/TFIDFbee/TFIDFbee/Program.cs
+++ b/TFIDFbee/TFIDFbee/Program.cs
@ -1,5 +1,6 @@
 using Lemmatization;
 using System;
+using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
@ -13,28 +14,39 @@ namespace TFIDFbee
 {
    class Program
    {
-        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204.json";
+        private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json";
        private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer());
+        private readonly static ConcurrentDictionary<string, double> _scoring = new ConcurrentDictionary<string, double>();

        static void Main(string[] args)
        {
+            var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer);
+
+            Console.WriteLine(string.Join('-', terms.ToTokens()));
+            Console.WriteLine(string.Join('-', terms.ToUniqueTokens()));
+            Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords()));
+            Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}")));
+
+            
+
+            /*
            Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug);
            Configuration.Save(Configuration.ReadFromApplicationConfig());
-            IDocumentReader reader = new StateMachineReader(source, s => ExtractLemmas(s));
+            IDocumentReader reader = new JsonByLineReader(source, s => ExtractLemmas(s));

-            BagOfWords codebook;
+            ZeroLevel.Services.Semantic.Helpers.BagOfTerms codebook;
            if (File.Exists("model.bin"))
            {
                Log.Info("Load model from file");
                using (var stream = new FileStream("model.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
-                    codebook = MessageSerializer.Deserialize<BagOfWords>(stream);
+                    codebook = MessageSerializer.Deserialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream);
                }
            }
            else
            {
                Log.Info("Create and train model");
-                codebook = new BagOfWords();
+                codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfTerms();
                foreach (var batch in reader.ReadBatches(1000))
                {
                    codebook.Learn(batch);
@ -42,54 +54,52 @@ namespace TFIDFbee
                }
                using (var stream = new FileStream("model.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
-                    MessageSerializer.Serialize<BagOfWords>(stream, codebook);
+                    MessageSerializer.Serialize<ZeroLevel.Services.Semantic.Helpers.BagOfTerms>(stream, codebook);
                }
            }

-            Log.Info("Build document vectors");
-            List<SparceVector> vectors;
-            if (File.Exists("vectors.bin"))
-            {
-                Log.Info("Load vectors from file");
-                using (var stream = new FileStream("vectors.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
-                {
-                    vectors = MessageSerializer.DeserializeCompatible<List<SparceVector>>(stream);
-                }
-            }
-            else
-            {
+
            Log.Info("Create vectors");
-                vectors = new List<SparceVector>();
+
            foreach (var docs in reader.ReadRawDocumentBatches(1000))
            {
                foreach (var doc in docs)
                {
-                        var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token).Concat(_lexer.ExtractLexTokens(doc.Item1).Select(t => t.Token)).ToArray();
-                        vectors.Add(codebook.Transform(words));
+                    var words = ExtractLemmas(doc.Item2).Concat(ExtractLemmas(doc.Item1)).Distinct().ToArray();
+                    var vector = codebook.Transform(words);
+                    for (var i = 0; i< words.Length; i++)
+                    {
+                        var word = words[i];
+                        if (false == _scoring.ContainsKey(word))
+                        {
+                            _scoring.TryAdd(word, vector)
+                        }
+                    }
                }
            }
            using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
            {
                MessageSerializer.SerializeCompatible<List<SparceVector>>(stream, vectors);
            }
-            }
+

            Log.Info("Find similar documents");
            var list = new List<Tuple<double, int, int>>();
-            long total_count = (vectors.Count * vectors.Count);
+            long total_count = ((long)vectors.Count * (long)vectors.Count);
            long count = 0;
+            double d = (double.Epsilon * 2.0d);
            for (int i = 0; i < vectors.Count; i++)
            {
                for (int j = i + 1; j < vectors.Count - 1; j++)
                {
                    count++;
-                    if (count % 100000 == 0)
+                    if (count % 10000000 == 0)
                    {
-                        Log.Info($"Progress: {(int)(count * 100.0d / (double)total_count)} %.\tFound similars: {list.Count}.");
+                        Log.Info($"Progress: {((count * 100.0d) / total_count)} %.\tFound similars: {list.Count}.");
                    }
                    if (i == j) continue;
                    var diff = vectors[i].Measure(vectors[j]);
-                    if (diff > 0.885d)
+                    if (diff > d && diff < 0.0009d)
                    {
                        list.Add(Tuple.Create(diff, i, j));
                    }
@ -141,7 +151,7 @@ namespace TFIDFbee
                    output.WriteLine();
                }
            }
-
+            */
            Console.WriteLine("Completed");
            Console.ReadKey();
        }
--- a/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs
+++ b/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs
@ -10,18 +10,24 @@ namespace ZeroLevel.Services.Semantic
        /// <returns>Spisok tokenov</returns>
        IEnumerable<LexToken> ExtractLexTokens(string text);

+        /// <summary>
+        /// Extracting tokens from an array of words instead of a raw text
+        /// </summary>
+        /// <param name="words">Words to convert into tokens</param>
+        /// <returns>Tokens</returns>
+        IEnumerable<LexToken> ExtractLexTokens(string[] words);
+
        /// <summary>
        /// Selecting unique tokens from text
        /// </summary>
        /// <returns>Tokens</returns>
        IEnumerable<LexToken> ExtractUniqueLexTokens(string text);

+        /// <summary>
+        /// Selecting unique tokens from an array of words instead of a raw text
+        /// </summary>
+        /// <param name="words">Words to convert into tokens</param>
+        /// <returns>Tokens</returns>
+        IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words);
+
        /// <summary>
        /// Allocation of unique tokens from text with drop of stop words
        /// </summary>
        /// <returns>Tokens</returns>
        IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text);

+        /// <summary>
+        /// Selecting unique tokens from an array of words instead of a raw text, with drop of stop words
+        /// </summary>
+        /// <param name="words">Words to convert into tokens</param>
+        /// <returns>Tokens</returns>
+        IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words);
+
        /// <summary>
        /// Search for tokens in the text corresponding to the specified words (full-text search)
        /// </summary>
--- a/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs
+++ b/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs
@ -1,14 +1,84 @@
-using System;
+using Iveonik.Stemmers;
+using System;
 using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.Linq;
 using System.Threading;
 using System.Threading.Tasks;
+using ZeroLevel.Implementation.Semantic.Helpers;
 using ZeroLevel.Services.Serialization;

 namespace ZeroLevel.Services.Semantic.Helpers
 {
-    public class BagOfWords :
+    /// <summary>
+    /// Holds the tokens obtained by passing a set of words through a lex provider,
+    /// and exposes them as-is, de-duplicated, de-duplicated without stop words,
+    /// or as a token-to-occurrence-count map.
+    /// </summary>
+    public class BagOfTerms
+    {
+        private readonly string[] _words;
+        private readonly ILexProvider _lexer;
+
+        /// <summary>
+        /// Splits the text with <c>TextAnalizer.ExtractWords</c> and tokenizes the words
+        /// with a <c>LexProvider</c> built on <c>RussianStemmer</c>.
+        /// </summary>
+        public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { }
+
+        /// <summary>
+        /// Splits the text with <c>TextAnalizer.ExtractWords</c> and tokenizes the words with the given lexer.
+        /// </summary>
+        public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { }
+
+        /// <summary>
+        /// Tokenizes the given words with a <c>LexProvider</c> built on <c>RussianStemmer</c>.
+        /// </summary>
+        public BagOfTerms(IEnumerable<string> words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { }
+
+        /// <summary>
+        /// Tokenizes the given words with the given lexer.
+        /// </summary>
+        public BagOfTerms(IEnumerable<string> words, ILexProvider lexer) : this(words.ToArray(), lexer) { }
+
+        /// <summary>
+        /// Tokenizes the given words with a <c>LexProvider</c> built on <c>RussianStemmer</c>.
+        /// </summary>
+        public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { }
+
+        /// <summary>
+        /// Tokenizes the given words with the given lexer; all other constructors delegate here.
+        /// </summary>
+        /// <param name="words">Source words passed to the lexer</param>
+        /// <param name="lexer">Provider used to turn the words into tokens</param>
+        /// <exception cref="ArgumentNullException">words or lexer is null</exception>
+        public BagOfTerms(string[] words, ILexProvider lexer)
+        {
+            if (words == null)
+            {
+                throw new ArgumentNullException(nameof(words));
+            }
+            _lexer = lexer ?? throw new ArgumentNullException(nameof(lexer));
+            _frequency = null;
+            // Only the Token value of each lex token is kept; the source words are not stored.
+            _words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray();
+        }
+
+        /// <summary>
+        /// Tokens held by this bag (the internal array itself, not a copy).
+        /// </summary>
+        public string[] Words => _words;
+
+        // Lazily built token -> count map; stays null until Freguency() is first called.
+        private IDictionary<string, int> _frequency;
+
+        /// <summary>
+        /// Returns how many times each token occurs in the bag.
+        /// The map is built on the first call and the same instance is returned afterwards.
+        /// NOTE(review): the method name is misspelled ("Freguency"); it is kept because callers use it.
+        /// </summary>
+        /// <returns>Token to occurrence count map</returns>
+        public IDictionary<string, int> Freguency()
+        {
+            if (_frequency == null)
+            {
+                var frequency = new Dictionary<string, int>();
+                foreach (var word in _words)
+                {
+                    // TryGetValue leaves count at 0 for an unseen token, so one lookup covers both cases.
+                    frequency.TryGetValue(word, out var count);
+                    frequency[word] = count + 1;
+                }
+                _frequency = frequency;
+            }
+            return _frequency;
+        }
+
+        /// <summary>
+        /// Returns all tokens, duplicates included (the internal array itself, not a copy).
+        /// </summary>
+        public string[] ToTokens()
+        {
+            return _words;
+        }
+
+        /// <summary>
+        /// Returns the tokens with duplicates removed, as a new array.
+        /// </summary>
+        public string[] ToUniqueTokens()
+        {
+            return _words.DistinctBy(s => s)
+                .ToArray();
+        }
+
+        /// <summary>
+        /// Returns the tokens with stop words (per <c>StopWords.IsStopWord</c>) and duplicates removed, as a new array.
+        /// </summary>
+        public string[] ToUniqueTokensWithoutStopWords()
+        {
+            return _words.Where(w => StopWords.IsStopWord(w) == false)
+                .DistinctBy(s => s)
+                .ToArray();
+        }
+    }
+
+
+
+    public class BagOfWords1 :
        IBinarySerializable
    {
        private ConcurrentDictionary<string, int[]> _words;
@ -18,7 +88,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
        public long NumberOfDocuments => _number_of_documents;
        public int NumberOfWords => _words.Count;

-        public BagOfWords() =>
+        public BagOfWords1() =>
            _words = new ConcurrentDictionary<string, int[]>();

        /// <summary>
@ -74,7 +144,7 @@ namespace ZeroLevel.Services.Semantic.Helpers
                if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0]))
                {
                    var tf = (double)map[word] / (double)doc.Length;
-                    var idf = Math.Log(_number_of_documents / _words[word][1]);
+                    var idf = Math.Log(1 + (_number_of_documents / _words[word][1]));
                    var tfidf = tf * idf;
                    if (Math.Abs(tfidf) > double.Epsilon)
                    {
--- a/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs
+++ b/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs
@ -14,6 +14,8 @@ namespace ZeroLevel.Services.Semantic.Helpers
        private double[] values;
        private double power;

+
+
        public SparceVector() 
        {
            indexes = EmptyIndexes;
--- a/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs
+++ b/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs
@ -62,6 +62,16 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
            return result;
        }

+        /// <summary>
+        /// Wrapping each word of the array into a token built from the word and its index in the array
+        /// </summary>
+        /// <param name="words">Words to wrap</param>
+        /// <returns>Tokens, in the order of the input words</returns>
+        public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
+        {
+            var tokens = new List<WordToken>();
+            var index = 0;
+            foreach (var word in words)
+            {
+                tokens.Add(new WordToken(word, index));
+                index++;
+            }
+            return tokens;
+        }
+
        /// <summary>
        /// Selection of unique tokens from the text (first entry)
        /// </summary>
@ -72,6 +82,11 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
            return ExtractWordTokens(text).DistinctBy(t => t.Word);
        }

+        /// <summary>
+        /// Selection of unique tokens from an array of words (distinct by word)
+        /// </summary>
+        /// <param name="words">Words to wrap into tokens</param>
+        /// <returns>Tokens with distinct words</returns>
+        public static IEnumerable<WordToken> ExtractUniqueWordTokens(string[] words)
+        {
+            var allTokens = ExtractWordTokens(words);
+            return allTokens.DistinctBy(token => token.Word);
+        }
+
        /// <summary>
        /// Allocation of unique tokens from text with drop of stop words
        /// </summary>
@ -81,5 +96,10 @@ namespace ZeroLevel.Implementation.Semantic.Helpers
        {
            return ExtractWordTokens(text).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false);
        }
+
+        /// <summary>
+        /// Selection of unique tokens from an array of words (distinct by word) with drop of stop words
+        /// </summary>
+        /// <param name="words">Words to wrap into tokens</param>
+        /// <returns>Tokens with distinct words, none of which is a stop word</returns>
+        public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string[] words)
+        {
+            var uniqueTokens = ExtractWordTokens(words).DistinctBy(token => token.Word);
+            return uniqueTokens.Where(token => !StopWords.IsStopWord(token.Word));
+        }
    }
 }
--- a/ZeroLevel/Services/Semantic/LexProvider.cs
+++ b/ZeroLevel/Services/Semantic/LexProvider.cs
@ -25,7 +25,16 @@ namespace ZeroLevel.Services.Semantic
            {
                result.Add(new LexToken(match.Value, _lexer.Lex(match.Value), match.Index));
            }
+            return result;
+        }

+        /// <summary>
+        /// Builds one lex token per word: the word itself, its form returned by the lexer, and its index in the array
+        /// </summary>
+        /// <param name="words">Words to convert into lex tokens</param>
+        /// <returns>Lex tokens, in the order of the input words</returns>
+        public IEnumerable<LexToken> ExtractLexTokens(string[] words)
+        {
+            var result = new List<LexToken>();
+            var index = 0;
+            foreach (var word in words)
+            {
+                result.Add(new LexToken(word, _lexer.Lex(word), index));
+                index++;
+            }
             return result;
         }

@ -35,12 +44,24 @@ namespace ZeroLevel.Services.Semantic
                .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
        }

+        /// <summary>
+        /// Builds lex tokens for the unique words of the array, then keeps only tokens with distinct Token values
+        /// </summary>
+        /// <param name="words">Words to convert into lex tokens</param>
+        /// <returns>Lex tokens with distinct Token values</returns>
+        public IEnumerable<LexToken> ExtractUniqueLexTokens(string[] words)
+        {
+            var uniqueWords = TextAnalizer.ExtractUniqueWordTokens(words);
+            var lexTokens = uniqueWords.Select(word => new LexToken(word.Word, _lexer.Lex(word.Word), word.Position));
+            return lexTokens.DistinctBy(lex => lex.Token);
+        }
+
        public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string text)
        {
            return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(text)
                .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token);
        }

+        /// <summary>
+        /// Builds lex tokens for the unique non-stop words of the array, then keeps only tokens with distinct Token values
+        /// </summary>
+        /// <param name="words">Words to convert into lex tokens</param>
+        /// <returns>Lex tokens with distinct Token values</returns>
+        public IEnumerable<LexToken> ExtractUniqueLexTokensWithoutStopWords(string[] words)
+        {
+            var uniqueWords = TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(words);
+            var lexTokens = uniqueWords.Select(word => new LexToken(word.Word, _lexer.Lex(word.Word), word.Position));
+            return lexTokens.DistinctBy(lex => lex.Token);
+        }
+
        public IDictionary<string, IEnumerable<LexToken>> SearchLexTokensByWords(string text, string[] words)
        {
            var result = new Dictionary<string, IEnumerable<LexToken>>();