using System;
using System.Buffers;
using System.Collections.Generic;

namespace ZeroLevel.Services.Semantic
{
    /// <summary>
    /// Splits text into word tokens. A token starts with a character for which
    /// <see cref="char.IsLetter(char)"/> is true and continues while
    /// <see cref="char.IsLetterOrDigit(char)"/> is true; every other character
    /// acts as a separator and is dropped.
    /// </summary>
    public static class WordTokenizer
    {
        /// <summary>Initial size requested from the pool for the token scratch buffer.</summary>
        private const int ARRAY_SIZE = 2048;

        /// <summary>Pool that supplies the scratch buffers used while scanning.</summary>
        private static readonly ArrayPool<char> _pool = ArrayPool<char>.Create();

        /// <summary>
        /// Lazily enumerates the tokens of <paramref name="text"/>, each converted
        /// with <see cref="string.ToLowerInvariant()"/>.
        /// </summary>
        /// <param name="text">Text to tokenize; <c>null</c> or empty yields no tokens.</param>
        /// <returns>The tokens in order of appearance.</returns>
        public static IEnumerable<string> Tokenize(string text)
        {
            return TokenizeCore(text, true);
        }

        /// <summary>
        /// Lazily enumerates the tokens of <paramref name="text"/>, keeping the
        /// original character casing.
        /// </summary>
        /// <param name="text">Text to tokenize; <c>null</c> or empty yields no tokens.</param>
        /// <returns>The tokens in order of appearance.</returns>
        public static IEnumerable<string> TokenizeCaseSensitive(string text)
        {
            return TokenizeCore(text, false);
        }

        /// <summary>
        /// Shared scanner behind both public entry points.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="lowerCase">When true, each token is lower-cased (invariant culture) before it is yielded.</param>
        private static IEnumerable<string> TokenizeCore(string text, bool lowerCase)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            int index = 0;
            char[] buffer = _pool.Rent(ARRAY_SIZE);
            try
            {
                for (int i = 0; i < text.Length; i++)
                {
                    char current = text[i];

                    // index == 0 means no token is in progress, so only a letter may start one;
                    // once started, digits are accepted as well.
                    bool belongsToToken = index == 0
                        ? char.IsLetter(current)
                        : char.IsLetterOrDigit(current);

                    if (belongsToToken)
                    {
                        // A single token may be longer than the rented array; grow instead of
                        // writing past the end of the buffer.
                        if (index == buffer.Length)
                        {
                            buffer = Grow(buffer, index);
                        }
                        buffer[index++] = current;
                    }
                    else if (index > 0)
                    {
                        yield return CreateToken(buffer, index, lowerCase);
                        index = 0;
                    }
                }

                // Flush the token that runs up to the end of the text.
                if (index > 0)
                {
                    yield return CreateToken(buffer, index, lowerCase);
                }
            }
            finally
            {
                // Executes when enumeration finishes or the enumerator is disposed.
                _pool.Return(buffer);
            }
        }

        /// <summary>Builds the token string from the first <paramref name="length"/> buffered characters.</summary>
        private static string CreateToken(char[] buffer, int length, bool lowerCase)
        {
            var token = new string(buffer, 0, length);
            return lowerCase ? token.ToLowerInvariant() : token;
        }

        /// <summary>
        /// Rents an array twice as large, copies the first <paramref name="used"/> characters
        /// into it and hands the old array back to the pool.
        /// </summary>
        private static char[] Grow(char[] buffer, int used)
        {
            // Rent first: if this throws, the caller still holds (and will return) the old array.
            char[] larger = _pool.Rent(buffer.Length * 2);
            Array.Copy(buffer, larger, used);
            _pool.Return(buffer);
            return larger;
        }
    }
}