diff --git a/TFIDFbee/TFIDFbee/Program.cs b/TFIDFbee/TFIDFbee/Program.cs index e47aa19..a140eed 100644 --- a/TFIDFbee/TFIDFbee/Program.cs +++ b/TFIDFbee/TFIDFbee/Program.cs @@ -1,5 +1,6 @@ using Lemmatization; using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; using System.Linq; @@ -13,28 +14,39 @@ namespace TFIDFbee { class Program { - private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204.json"; + private const string source = @"E:\Desktop\lenta-ru-data-set_19990901_20171204\lenta-ru-data-set_19990901_20171204_limit_1000.json"; private readonly static ILexProvider _lexer = new LexProvider(new LemmaLexer()); + private readonly static ConcurrentDictionary _scoring = new ConcurrentDictionary(); static void Main(string[] args) { + var terms = new BagOfTerms("На практике эти расширения используют нечасто, особенно те расширения, которые для расчёта", _lexer); + + Console.WriteLine(string.Join('-', terms.ToTokens())); + Console.WriteLine(string.Join('-', terms.ToUniqueTokens())); + Console.WriteLine(string.Join('-', terms.ToUniqueTokensWithoutStopWords())); + Console.WriteLine(string.Join('\n', terms.Freguency().Select(pair => $"{pair.Key}: {pair.Value}"))); + + + + /* Log.AddConsoleLogger(ZeroLevel.Logging.LogLevel.FullDebug); Configuration.Save(Configuration.ReadFromApplicationConfig()); - IDocumentReader reader = new StateMachineReader(source, s => ExtractLemmas(s)); + IDocumentReader reader = new JsonByLineReader(source, s => ExtractLemmas(s)); - BagOfWords codebook; + ZeroLevel.Services.Semantic.Helpers.BagOfTerms codebook; if (File.Exists("model.bin")) { Log.Info("Load model from file"); using (var stream = new FileStream("model.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { - codebook = MessageSerializer.Deserialize(stream); + codebook = MessageSerializer.Deserialize(stream); } } else { Log.Info("Create and train model"); - codebook = new BagOfWords(); + codebook = new ZeroLevel.Services.Semantic.Helpers.BagOfTerms(); foreach (var batch in reader.ReadBatches(1000)) { codebook.Learn(batch); @@ -42,54 +54,52 @@ namespace TFIDFbee } using (var stream = new FileStream("model.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite)) { - MessageSerializer.Serialize(stream, codebook); + MessageSerializer.Serialize(stream, codebook); } } - Log.Info("Build document vectors"); - List vectors; - if (File.Exists("vectors.bin")) - { - Log.Info("Load vectors from file"); - using (var stream = new FileStream("vectors.bin", FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) - { - vectors = MessageSerializer.DeserializeCompatible>(stream); - } - } - else + + Log.Info("Create vectors"); + + foreach (var docs in reader.ReadRawDocumentBatches(1000)) { - Log.Info("Create vectors"); - vectors = new List(); - foreach (var docs in reader.ReadRawDocumentBatches(1000)) + foreach (var doc in docs) { - foreach (var doc in docs) + var words = ExtractLemmas(doc.Item2).Concat(ExtractLemmas(doc.Item1)).Distinct().ToArray(); + var vector = codebook.Transform(words); + for (var i = 0; i< words.Length; i++) { - var words = _lexer.ExtractLexTokens(doc.Item2).Select(t => t.Token).Concat(_lexer.ExtractLexTokens(doc.Item1).Select(t => t.Token)).ToArray(); - vectors.Add(codebook.Transform(words)); + var word = words[i]; + if (false == _scoring.ContainsKey(word)) + { + _scoring.TryAdd(word, vector) + } } } - using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite)) - { - MessageSerializer.SerializeCompatible>(stream, vectors); - } + } + using (var stream = new FileStream("vectors.bin", FileMode.Create, FileAccess.Write, FileShare.ReadWrite)) + { + MessageSerializer.SerializeCompatible>(stream, vectors); } + Log.Info("Find similar documents"); var list = new List>(); - long total_count = (vectors.Count * vectors.Count); + long total_count = ((long)vectors.Count * (long)vectors.Count); long count = 0; + double d = (double.Epsilon * 2.0d); for (int i = 0; i < vectors.Count; i++) { for (int j = i + 1; j < vectors.Count - 1; j++) { count++; - if (count % 100000 == 0) + if (count % 10000000 == 0) { - Log.Info($"Progress: {(int)(count * 100.0d / (double)total_count)} %.\tFound similars: {list.Count}."); + Log.Info($"Progress: {((count * 100.0d) / total_count)} %.\tFound similars: {list.Count}."); } if (i == j) continue; var diff = vectors[i].Measure(vectors[j]); - if (diff > 0.885d) + if (diff > d && diff < 0.0009d) { list.Add(Tuple.Create(diff, i, j)); } @@ -141,7 +151,7 @@ namespace TFIDFbee output.WriteLine(); } } - + */ Console.WriteLine("Completed"); Console.ReadKey(); } diff --git a/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs b/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs index 685c6cd..2454d1e 100644 --- a/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs +++ b/ZeroLevel/Services/Semantic/Contracts/ILexProvider.cs @@ -10,18 +10,24 @@ namespace ZeroLevel.Services.Semantic /// Spisok tokenov IEnumerable ExtractLexTokens(string text); + IEnumerable ExtractLexTokens(string[] words); + /// /// Selecting unique tokens from text /// /// Tokens IEnumerable ExtractUniqueLexTokens(string text); + IEnumerable ExtractUniqueLexTokens(string[] words); + /// /// Allocation of unique tokens from text with drop of stop words /// /// Tokens IEnumerable ExtractUniqueLexTokensWithoutStopWords(string text); + IEnumerable ExtractUniqueLexTokensWithoutStopWords(string[] words); + /// /// Search for tokens in the text corresponding to the specified words (full-text search) /// diff --git a/ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs b/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs similarity index 60% rename from ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs rename to ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs index 6fd4ecc..0e7cbc5 100644 --- a/ZeroLevel/Services/Semantic/Helpers/BagOfWords.cs +++ b/ZeroLevel/Services/Semantic/Helpers/BagOfTerms.cs @@ -1,14 +1,84 @@ -using System; +using Iveonik.Stemmers; +using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; using System.Threading.Tasks; +using ZeroLevel.Implementation.Semantic.Helpers; using ZeroLevel.Services.Serialization; namespace ZeroLevel.Services.Semantic.Helpers { - public class BagOfWords : + public class BagOfTerms + { + private string[] _words; + private ILexProvider _lexer; + + public BagOfTerms(string text) : this(TextAnalizer.ExtractWords(text).ToArray(), new LexProvider(new RussianStemmer())) { } + + public BagOfTerms(string text, ILexProvider lexer) : this(TextAnalizer.ExtractWords(text).ToArray(), lexer) { } + + public BagOfTerms(IEnumerable words) : this(words.ToArray(), new LexProvider(new RussianStemmer())) { } + + public BagOfTerms(IEnumerable words, ILexProvider lexer) : this(words.ToArray(), lexer) { } + + public BagOfTerms(string[] words) : this(words, new LexProvider(new RussianStemmer())) { } + + public BagOfTerms(string[] words, ILexProvider lexer) + { + _lexer = lexer; + _frequency = null; + _words = _lexer.ExtractLexTokens(words).Select(t => t.Token).ToArray(); + } + + public string[] Words => _words; + + private IDictionary _frequency; + + public IDictionary Freguency() + { + if (_frequency == null) + { + var frequency = new Dictionary(); + for (int i = 0; i < _words.Length; i++) + { + if (frequency.ContainsKey(_words[i])) + { + frequency[_words[i]]++; + } + else + { + frequency[_words[i]] = 1; + } + } + _frequency = frequency; + } + return _frequency; + } + + public string[] ToTokens() + { + return _words; + } + + public string[] ToUniqueTokens() + { + return _words.DistinctBy(s => s) + .ToArray(); + } + + public string[] ToUniqueTokensWithoutStopWords() + { + return _words.Where(w => StopWords.IsStopWord(w) == false) + .DistinctBy(s => s) + .ToArray(); + } + } + + + + public class BagOfWords1 : IBinarySerializable { private ConcurrentDictionary _words; @@ -18,7 +88,7 @@ namespace ZeroLevel.Services.Semantic.Helpers public long NumberOfDocuments => _number_of_documents; public int NumberOfWords => _words.Count; - public BagOfWords() => + public BagOfWords1() => _words = new ConcurrentDictionary(); /// @@ -74,7 +144,7 @@ namespace ZeroLevel.Services.Semantic.Helpers if (_words.ContainsKey(word) && !result.ContainsKey(_words[word][0])) { var tf = (double)map[word] / (double)doc.Length; - var idf = Math.Log(_number_of_documents / _words[word][1]); + var idf = Math.Log(1 + (_number_of_documents / _words[word][1])); var tfidf = tf * idf; if (Math.Abs(tfidf) > double.Epsilon) { diff --git a/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs b/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs index c093f25..96c129c 100644 --- a/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs +++ b/ZeroLevel/Services/Semantic/Helpers/SparceVector.cs @@ -14,6 +14,8 @@ namespace ZeroLevel.Services.Semantic.Helpers private double[] values; private double power; + + public SparceVector() { indexes = EmptyIndexes; diff --git a/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs b/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs index 1dc9aeb..cfefb13 100644 --- a/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs +++ b/ZeroLevel/Services/Semantic/Helpers/TextAnalizer.cs @@ -62,6 +62,16 @@ namespace ZeroLevel.Implementation.Semantic.Helpers return result; } + public static IEnumerable ExtractWordTokens(string[] words) + { + var result = new List(); + for (int i = 0; i < words.Length; i++) + { + result.Add(new WordToken(words[i], i)); + } + return result; + } + /// /// Selection of unique tokens from the text (first entry) /// @@ -72,6 +82,11 @@ namespace ZeroLevel.Implementation.Semantic.Helpers return ExtractWordTokens(text).DistinctBy(t => t.Word); } + public static IEnumerable ExtractUniqueWordTokens(string[] words) + { + return ExtractWordTokens(words).DistinctBy(t => t.Word); + } + /// /// Allocation of unique tokens from text with drop of stop words /// @@ -81,5 +96,10 @@ namespace ZeroLevel.Implementation.Semantic.Helpers { return ExtractWordTokens(text).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false); } + + public static IEnumerable ExtractUniqueWordTokensWithoutStopWords(string[] words) + { + return ExtractWordTokens(words).DistinctBy(t => t.Word).Where(t => StopWords.IsStopWord(t.Word) == false); + } } } \ No newline at end of file diff --git a/ZeroLevel/Services/Semantic/LexProvider.cs b/ZeroLevel/Services/Semantic/LexProvider.cs index b81230e..2dcbd2c 100644 --- a/ZeroLevel/Services/Semantic/LexProvider.cs +++ b/ZeroLevel/Services/Semantic/LexProvider.cs @@ -25,7 +25,16 @@ namespace ZeroLevel.Services.Semantic { result.Add(new LexToken(match.Value, _lexer.Lex(match.Value), match.Index)); } + return result; + } + public IEnumerable ExtractLexTokens(string[] words) + { + var result = new List(); + for(int i=0; i < words.Length; i++) + { + result.Add(new LexToken(words[i], _lexer.Lex(words[i]), i)); + } return result; } @@ -35,12 +44,24 @@ namespace ZeroLevel.Services.Semantic .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token); } + public IEnumerable ExtractUniqueLexTokens(string[] words) + { + return TextAnalizer.ExtractUniqueWordTokens(words) + .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token); + } + public IEnumerable ExtractUniqueLexTokensWithoutStopWords(string text) { return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(text) .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token); } + public IEnumerable ExtractUniqueLexTokensWithoutStopWords(string[] words) + { + return TextAnalizer.ExtractUniqueWordTokensWithoutStopWords(words) + .Select(w => new LexToken(w.Word, _lexer.Lex(w.Word), w.Position)).DistinctBy(s => s.Token); + } + public IDictionary> SearchLexTokensByWords(string text, string[] words) { var result = new Dictionary>();