// Provenance: recovered from a git format-patch message.
//   From 0b4de1664098e19a486c94360a556a9d709e76a1 Mon Sep 17 00:00:00 2001
//   From: Ogoun
//   Date: Mon, 25 Mar 2019 21:06:33 +0300
//   Subject: [PATCH] Add Ngramm builder
// The patch created ZeroLevel/Services/Semantic/Helpers/NGramms.cs
// (new file mode 100644, index 0000000..7b72ed8) and added one line to
// ZeroLevel/ZeroLevel.csproj (index 8897fa2..20beaad, hunk @@ -294,6 +294,7 @@).
// The text of that .csproj line is not present in the recovered copy.

using System;
using System.Collections.Generic;
using ZeroLevel.Services.Collections;

/*
    Example with text lines (n-grams never span two lines):
    var freg_dict = BuildNGramm(File.ReadAllLines("samples.txt"), line => _provider.ExtractLexTokens(line).Where(w => StopWords.IsStopWord(w.Token) == false).Select(t => t.Token.ToLowerInvariant()), 2);

    Example with sentences (n-grams never span two sentences):
    var freg_dict = BuildNGramm(TAStringReader.ReadSentenses(File.ReadAllText("samples.txt")), sent => sent.Tokens.Select(t => t.Value).Where(w => StopWords.IsStopWord(w) == false), 3);
*/

namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Helpers that count how often each n-gram (N consecutive tokens joined by a
    /// single space) occurs in a sequence of input items.
    /// </summary>
    public static class NGramms
    {
        /// <summary>
        /// Counts n-grams over a sequence of strings (for example, text lines).
        /// </summary>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Splits one input string into its tokens.</param>
        /// <param name="N">Number of tokens per n-gram; must be at least 1.</param>
        /// <returns>Map from the space-joined n-gram to its number of occurrences.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="tokenizer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="N"/> is less than 1.</exception>
        public static Dictionary<string, int> BuildNGramm(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer, int N)
            // The string version is just the generic algorithm with T = string;
            // the explicit type argument forces the generic overload (no recursion).
            => BuildNGramm<string>(input, tokenizer, N);

        /// <summary>Counts single tokens (1-grams) over a sequence of strings.</summary>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Splits one input string into its tokens.</param>
        /// <returns>Map from token to its number of occurrences.</returns>
        public static Dictionary<string, int> GetUnigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 1);

        /// <summary>Counts token pairs (2-grams) over a sequence of strings.</summary>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Splits one input string into its tokens.</param>
        /// <returns>Map from the space-joined pair to its number of occurrences.</returns>
        public static Dictionary<string, int> GetBigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 2);

        /// <summary>Counts token triples (3-grams) over a sequence of strings.</summary>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Splits one input string into its tokens.</param>
        /// <returns>Map from the space-joined triple to its number of occurrences.</returns>
        public static Dictionary<string, int> GetTrigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 3);

        /// <summary>
        /// Counts n-grams over a sequence of arbitrary items (for example, parsed sentences).
        /// Every time the token window holds exactly <paramref name="N"/> tokens, those tokens
        /// are joined with a single space and the counter for that phrase is incremented.
        /// The window is emptied after each item, so an n-gram never crosses an item boundary.
        /// </summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Extracts the token strings of one item, in order.</param>
        /// <param name="N">Number of tokens per n-gram; must be at least 1.</param>
        /// <returns>Map from the space-joined n-gram to its number of occurrences.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="tokenizer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="N"/> is less than 1.</exception>
        public static Dictionary<string, int> BuildNGramm<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer, int N)
        {
            if (input == null) throw new ArgumentNullException(nameof(input));
            if (tokenizer == null) throw new ArgumentNullException(nameof(tokenizer));
            if (N < 1) throw new ArgumentOutOfRangeException(nameof(N), N, "N-gram size must be at least 1.");

            var ngramms = new Dictionary<string, int>();
            // NOTE(review): presumably FixSizeQueue.Push evicts the oldest element once N
            // elements are held and Dump() yields them oldest-first, which is what makes
            // this a sliding window - confirm against FixSizeQueue.
            var window = new FixSizeQueue<string>(N);
            foreach (var item in input)
            {
                foreach (var token in tokenizer(item))
                {
                    window.Push(token);
                    if (window.Count == N)
                    {
                        var phrase = string.Join(" ", window.Dump());
                        // Single lookup: count stays 0 when the phrase is new.
                        int count;
                        ngramms.TryGetValue(phrase, out count);
                        ngramms[phrase] = count + 1;
                    }
                }
                // Drain the window so the next item starts with no carried-over tokens.
                while (window.Count > 0)
                {
                    window.Take();
                }
            }
            return ngramms;
        }

        /// <summary>Counts single tokens (1-grams) over a sequence of arbitrary items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Extracts the token strings of one item, in order.</param>
        /// <returns>Map from token to its number of occurrences.</returns>
        public static Dictionary<string, int> GetUnigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 1);

        /// <summary>Counts token pairs (2-grams) over a sequence of arbitrary items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Extracts the token strings of one item, in order.</param>
        /// <returns>Map from the space-joined pair to its number of occurrences.</returns>
        public static Dictionary<string, int> GetBigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 2);

        /// <summary>Counts token triples (3-grams) over a sequence of arbitrary items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process; each item is tokenized independently.</param>
        /// <param name="tokenizer">Extracts the token strings of one item, in order.</param>
        /// <returns>Map from the space-joined triple to its number of occurrences.</returns>
        public static Dictionary<string, int> GetTrigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 3);
    }
}