using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using ZeroLevel.Services.Semantic;
namespace ZeroLevel.Implementation.Semantic.Helpers
{
/// <summary>
/// Helpers that split text into words and into positioned word tokens,
/// optionally de-duplicating them and dropping stop words.
/// </summary>
public static class TextAnalizer
{
    /// <summary>
    /// Matches one word: a run of word characters, Cyrillic letters, hyphens
    /// or right single quotation marks, delimited by word boundaries.
    /// </summary>
    internal static readonly Regex ReWord = new Regex("\\b[\\wА-Яа-я-’]+\\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Extracts every word from the text, in order of appearance (duplicates included).
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>The matched words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<string> ExtractWords(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        var words = new List<string>();
        foreach (Match match in ReWord.Matches(text))
        {
            words.Add(match.Value);
        }
        return words;
    }

    /// <summary>
    /// Extracts the distinct words from the text.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>A set of the unique words found in the text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<string> ExtractUniqueWords(string text)
    {
        return new HashSet<string>(ExtractWords(text));
    }

    /// <summary>
    /// Extracts the distinct words from the text, skipping every word that
    /// <c>StopWords.IsStopWord</c> reports as a stop word.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>A set of the unique words that are not stop words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<string> ExtractUniqueWordsWithoutStopWords(string text)
    {
        // De-duplicate first so each distinct word is checked against the stop list only once.
        return new HashSet<string>(ExtractUniqueWords(text).Where(w => StopWords.IsStopWord(w) == false));
    }

    /// <summary>
    /// Extracts word tokens from the text. Each token is built from the matched
    /// word and the character index at which the match starts.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>Tokens in order of appearance (duplicates included).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractWordTokens(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        var tokens = new List<WordToken>();
        foreach (Match match in ReWord.Matches(text))
        {
            tokens.Add(new WordToken(match.Value, match.Index));
        }
        return tokens;
    }

    /// <summary>
    /// Builds word tokens from an already split word array. Each token is built
    /// from the word and its index in the array.
    /// </summary>
    /// <param name="words">Words to wrap into tokens.</param>
    /// <returns>One token per array element, in array order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractWordTokens(string[] words)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }

        var tokens = new List<WordToken>(words.Length);
        for (int i = 0; i < words.Length; i++)
        {
            tokens.Add(new WordToken(words[i], i));
        }
        return tokens;
    }

    /// <summary>
    /// Extracts tokens from the text, de-duplicated by word via <c>DistinctBy</c>.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>Tokens with distinct words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractUniqueWordTokens(string text)
    {
        return ExtractWordTokens(text).DistinctBy(t => t.Word);
    }

    /// <summary>
    /// Builds tokens from the word array, de-duplicated by word via <c>DistinctBy</c>.
    /// </summary>
    /// <param name="words">Words to wrap into tokens.</param>
    /// <returns>Tokens with distinct words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractUniqueWordTokens(string[] words)
    {
        return ExtractWordTokens(words).DistinctBy(t => t.Word);
    }

    /// <summary>
    /// Extracts tokens from the text, de-duplicated by word, skipping every token
    /// whose word <c>StopWords.IsStopWord</c> reports as a stop word.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>Tokens with distinct words that are not stop words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string text)
    {
        return ExtractUniqueWordTokens(text).Where(t => StopWords.IsStopWord(t.Word) == false);
    }

    /// <summary>
    /// Builds tokens from the word array, de-duplicated by word, skipping every token
    /// whose word <c>StopWords.IsStopWord</c> reports as a stop word.
    /// </summary>
    /// <param name="words">Words to wrap into tokens.</param>
    /// <returns>Tokens with distinct words that are not stop words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    public static IEnumerable<WordToken> ExtractUniqueWordTokensWithoutStopWords(string[] words)
    {
        return ExtractUniqueWordTokens(words).Where(t => StopWords.IsStopWord(t.Word) == false);
    }
}
}