using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using ZeroLevel.Services.Semantic;

namespace ZeroLevel.Implementation.Semantic.Helpers
{
    /// <summary>
    /// Regex-based helpers that split text into words or positioned word tokens,
    /// optionally de-duplicated and filtered through <see cref="StopWords"/>.
    /// </summary>
    public static class TextAnalizer
    {
        // A "word" is a run of word characters, hyphens and the typographic
        // apostrophe (U+2019) that starts and ends on a word boundary.
        // NOTE(review): in .NET's default (non-ECMAScript) mode \w already covers
        // Cyrillic letters, so the explicit ranges and IgnoreCase look redundant;
        // the pattern is kept as-is so matching behavior does not change.
        internal static readonly Regex ReWord = new Regex(
            "\\b[\\wА-Яа-я-’]+\\b",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Extracts every word from the text, in order of appearance, keeping duplicates.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>All matches of <see cref="ReWord"/> found in the text.</returns>
        public static IEnumerable<string> ExtractWords(string text)
        {
            // Regex.Matches throws on null input; treat "no text" as "no words".
            if (string.IsNullOrEmpty(text))
            {
                return new List<string>();
            }

            return ReWord.Matches(text)
                .Cast<Match>()
                .Select(match => match.Value)
                .ToList();
        }

        /// <summary>
        /// Extracts the distinct words from the text.
        /// </summary>
        /// <remarks>
        /// Uniqueness uses the default string comparer, so words that differ
        /// only by letter case are treated as different words.
        /// </remarks>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>Set of unique words.</returns>
        public static IEnumerable<string> ExtractUniqueWords(string text)
        {
            return new HashSet<string>(ExtractWords(text));
        }

        /// <summary>
        /// Extracts the distinct words from the text, dropping every word
        /// that <see cref="StopWords.IsStopWord"/> reports as a stop word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>Set of unique words without stop words.</returns>
        public static IEnumerable<string> ExtractUniqueWordsWithoutStopWords(string text)
        {
            return new HashSet<string>(
                ExtractUniqueWords(text).Where(word => !StopWords.IsStopWord(word)));
        }

        /// <summary>
        /// Extracts word tokens from the text. Each token carries the matched
        /// word and the character index at which the match starts.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>Tokens in order of appearance, duplicates included.</returns>
        public static IEnumerable<WordToken> ExtractWordTokens(string text)
        {
            var tokens = new List<WordToken>();

            // Regex.Matches throws on null input; treat "no text" as "no tokens".
            if (string.IsNullOrEmpty(text))
            {
                return tokens;
            }

            foreach (Match match in ReWord.Matches(text))
            {
                tokens.Add(new WordToken(match.Value, match.Index));
            }

            return tokens;
        }

        /// <summary>
        /// Extracts word tokens that are unique by their word.
        /// </summary>
        /// <remarks>
        /// Intended to keep the first occurrence of each word; this relies on the
        /// DistinctBy implementation in scope preserving first-seen order - TODO confirm.
        /// </remarks>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>Tokens with distinct <c>Word</c> values.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokens(string text)
        {
            return ExtractWordTokens(text).DistinctBy(token => token.Word);
        }

        /// <summary>
        /// Extracts word tokens that are unique by their word, dropping every token
        /// whose word <see cref="StopWords.IsStopWord"/> reports as a stop word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty result.</param>
        /// <returns>Unique tokens without stop words.</returns>
        public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string text)
        {
            // Reuse the unique-token pipeline instead of repeating the DistinctBy chain.
            return ExtractUniqueWordTokens(text)
                .Where(token => !StopWords.IsStopWord(token.Word));
        }
    }
}