using System;
using System.Collections.Generic;
using ZeroLevel.Services.Collections;

/*
 Example with text lines:

    var freq_dict = BuildNGramm(
        File.ReadAllLines("samples.txt"),
        line => _provider.ExtractLexTokens(line)
                         .Where(w => StopWords.IsStopWord(w.Token) == false)
                         .Select(t => t.Token.ToLowerInvariant()),
        2);

 Example with sentences:

    var freq_dict = BuildNGramm(
        TAStringReader.ReadSentenses(File.ReadAllText("samples.txt")),
        sent => sent.Tokens.Select(t => t.Value).Where(w => StopWords.IsStopWord(w) == false),
        3);
*/

namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Helpers that count how often each sequence of N consecutive tokens (an n-gram)
    /// occurs in tokenized input.
    /// </summary>
    public static class NGramms
    {
        /// <summary>
        /// Counts n-grams over a sequence of text lines.
        /// </summary>
        /// <param name="input">Lines to process.</param>
        /// <param name="tokenizer">Splits one line into its tokens, in order.</param>
        /// <param name="N">Number of tokens per n-gram; must be at least 1.</param>
        /// <returns>
        /// A dictionary whose keys are the n-gram tokens joined with a single space and
        /// whose values are the number of times that n-gram was seen.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="tokenizer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="N"/> is less than 1.</exception>
        public static Dictionary<string, int> BuildNGramm(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer, int N)
        {
            // Same algorithm as the generic overload; the explicit type argument
            // guarantees the call binds to it rather than recursing into this method.
            return BuildNGramm<string>(input, tokenizer, N);
        }

        /// <summary>Counts single tokens (1-grams) over a sequence of text lines.</summary>
        /// <param name="input">Lines to process.</param>
        /// <param name="tokenizer">Splits one line into its tokens, in order.</param>
        /// <returns>Token to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetUnigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 1);

        /// <summary>Counts token pairs (2-grams) over a sequence of text lines.</summary>
        /// <param name="input">Lines to process.</param>
        /// <param name="tokenizer">Splits one line into its tokens, in order.</param>
        /// <returns>Space-joined pair to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetBigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 2);

        /// <summary>Counts token triples (3-grams) over a sequence of text lines.</summary>
        /// <param name="input">Lines to process.</param>
        /// <param name="tokenizer">Splits one line into its tokens, in order.</param>
        /// <returns>Space-joined triple to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetTrigramms(IEnumerable<string> input, Func<string, IEnumerable<string>> tokenizer)
            => BuildNGramm(input, tokenizer, 3);

        /// <summary>
        /// Counts n-grams over a sequence of arbitrary items (lines, sentences, ...).
        /// N-grams are built per item: the token buffer is emptied after every item,
        /// so no n-gram mixes tokens of two different items.
        /// </summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process.</param>
        /// <param name="tokenizer">Splits one item into its tokens, in order.</param>
        /// <param name="N">Number of tokens per n-gram; must be at least 1.</param>
        /// <returns>
        /// A dictionary whose keys are the n-gram tokens joined with a single space and
        /// whose values are the number of times that n-gram was seen.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="tokenizer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="N"/> is less than 1.</exception>
        public static Dictionary<string, int> BuildNGramm<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer, int N)
        {
            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }
            if (tokenizer == null)
            {
                throw new ArgumentNullException(nameof(tokenizer));
            }
            if (N < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(N), N, "N-gram size must be at least 1.");
            }

            var counts = new Dictionary<string, int>();
            // NOTE(review): relies on FixSizeQueue dropping its oldest element when a
            // Push exceeds the capacity, and on Dump() yielding oldest-to-newest —
            // confirm against ZeroLevel.Services.Collections.FixSizeQueue.
            var window = new FixSizeQueue<string>(N);

            foreach (var item in input)
            {
                foreach (var token in tokenizer(item))
                {
                    window.Push(token);
                    if (window.Count == N)
                    {
                        var phrase = string.Join(" ", window.Dump());

                        // Single lookup: 'seen' stays 0 when the phrase is new.
                        int seen;
                        counts.TryGetValue(phrase, out seen);
                        counts[phrase] = seen + 1;
                    }
                }

                // Empty the buffer so the next item starts from a clean state.
                while (window.Count > 0)
                {
                    window.Take();
                }
            }

            return counts;
        }

        /// <summary>Counts single tokens (1-grams) over a sequence of items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process.</param>
        /// <param name="tokenizer">Splits one item into its tokens, in order.</param>
        /// <returns>Token to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetUnigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm<T>(input, tokenizer, 1);

        /// <summary>Counts token pairs (2-grams) over a sequence of items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process.</param>
        /// <param name="tokenizer">Splits one item into its tokens, in order.</param>
        /// <returns>Space-joined pair to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetBigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm<T>(input, tokenizer, 2);

        /// <summary>Counts token triples (3-grams) over a sequence of items.</summary>
        /// <typeparam name="T">Type of the input items.</typeparam>
        /// <param name="input">Items to process.</param>
        /// <param name="tokenizer">Splits one item into its tokens, in order.</param>
        /// <returns>Space-joined triple to occurrence-count dictionary.</returns>
        public static Dictionary<string, int> GetTrigramms<T>(IEnumerable<T> input, Func<T, IEnumerable<string>> tokenizer)
            => BuildNGramm<T>(input, tokenizer, 3);
    }
}