using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using ZeroLevel.Services.Semantic;

namespace ZeroLevel.Implementation.Semantic.Helpers
{
    /// <summary>
    /// Regex-based helpers that split text into words and word tokens,
    /// optionally de-duplicating them and dropping stop words.
    /// </summary>
    public static class TextAnalizer
    {
        // Any run of word characters, hyphens or typographic apostrophes
        // delimited by word boundaries.
        internal static readonly Regex ReWord = new Regex("\\b[\\wА-Яа-я-’]+\\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Cyrillic-only words. 'Ё' (U+0401) and 'ё' (U+0451) lie outside the
        // А-Я / а-я ranges and must be listed explicitly; without them a word
        // containing 'ё' fails the \b anchors and is skipped entirely.
        internal static readonly Regex ReRuWord = new Regex("\\b[А-Яа-яЁё-]+\\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Extracts words from text.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields no words.</param>
        /// <returns>Every match of <see cref="ReWord"/>, in order of appearance (duplicates included).</returns>
        public static IEnumerable<string> ExtractWords(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            foreach (Match match in ReWord.Matches(text))
            {
                yield return match.Value;
            }
        }

        /// <summary>
        /// Extracts Cyrillic words from text.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields no words.</param>
        /// <returns>Every match of <see cref="ReRuWord"/>, in order of appearance (duplicates included).</returns>
        public static IEnumerable<string> ExtractRuWords(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            foreach (Match match in ReRuWord.Matches(text))
            {
                yield return match.Value;
            }
        }

        /// <summary>
        /// Extracts unique words from text.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty set.</param>
        /// <returns>
        /// A set of the distinct words found. Comparison uses the default string
        /// comparer, so words differing only by case are kept separately.
        /// </returns>
        public static IEnumerable<string> ExtractUniqueWords(string text)
        {
            return new HashSet<string>(ExtractWords(text));
        }

        /// <summary>
        /// Extracts unique words from text, excluding stop words.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty set.</param>
        /// <returns>A set of the distinct words for which <c>StopWords.IsStopWord</c> is false.</returns>
        public static IEnumerable<string> ExtractUniqueWordsWithoutStopWords(string text)
        {
            // Filter first, then de-duplicate once: no intermediate set is needed.
            return new HashSet<string>(ExtractWords(text).Where(w => !StopWords.IsStopWord(w)));
        }

        /// <summary>
        /// Extracts word tokens from text.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields no tokens.</param>
        /// <returns>
        /// One token per <see cref="ReWord"/> match, constructed from the matched
        /// value and the match's character offset within <paramref name="text"/>.
        /// </returns>
        public static IEnumerable<WordToken> ExtractWordTokens(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            foreach (Match match in ReWord.Matches(text))
            {
                yield return new WordToken(match.Value, match.Index);
            }
        }

        /// <summary>
        /// Wraps already-split words into tokens.
        /// </summary>
        /// <param name="words">Words to wrap; null yields no tokens.</param>
        /// <returns>
        /// One token per array element, constructed from the word and its index
        /// in <paramref name="words"/>.
        /// </returns>
        public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
        {
            if (words == null)
            {
                yield break;
            }

            for (int i = 0; i < words.Length; i++)
            {
                yield return new WordToken(words[i], i);
            }
        }

        /// <summary>
        /// Extracts tokens from text, de-duplicated by word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields no tokens.</param>
        /// <returns>Tokens from <see cref="ExtractWordTokens(string)"/> made distinct by their <c>Word</c>.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokens(string text)
        {
            return ExtractWordTokens(text).DistinctBy(t => t.Word);
        }

        /// <summary>
        /// Wraps already-split words into tokens, de-duplicated by word.
        /// </summary>
        /// <param name="words">Words to wrap; null yields no tokens.</param>
        /// <returns>Tokens from <see cref="ExtractWordTokens(string[])"/> made distinct by their <c>Word</c>.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokens(string[] words)
        {
            return ExtractWordTokens(words).DistinctBy(t => t.Word);
        }

        /// <summary>
        /// Extracts tokens from text, de-duplicated by word and with stop words dropped.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields no tokens.</param>
        /// <returns>Unique tokens whose <c>Word</c> is not reported as a stop word by <c>StopWords.IsStopWord</c>.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string text)
        {
            return ExtractUniqueWordTokens(text).Where(t => !StopWords.IsStopWord(t.Word));
        }

        /// <summary>
        /// Wraps already-split words into tokens, de-duplicated by word and with stop words dropped.
        /// </summary>
        /// <param name="words">Words to wrap; null yields no tokens.</param>
        /// <returns>Unique tokens whose <c>Word</c> is not reported as a stop word by <c>StopWords.IsStopWord</c>.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string[] words)
        {
            return ExtractUniqueWordTokens(words).Where(t => !StopWords.IsStopWord(t.Word));
        }
    }
}