You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Zero/ZeroLevel/Services/Semantic/WordTokenizer.cs

49 lines
1.4 KiB

using System;
using System.Buffers;
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic
{
/// <summary>
/// Splits text into lower-cased word tokens.
/// </summary>
public static class WordTokenizer
{
    /// <summary>
    /// Lazily enumerates the word tokens found in <paramref name="text"/>.
    /// </summary>
    /// <remarks>
    /// A token begins at a character for which <see cref="Char.IsLetter(char)"/> is true and
    /// continues while <see cref="Char.IsLetterOrDigit(char)"/> is true. Characters that are
    /// not letters and appear before a token has started (including digits) are skipped, so
    /// "42nd" produces "nd" while "abc123" produces "abc123".
    /// Each token is converted with <see cref="string.ToLowerInvariant"/> before it is returned.
    /// </remarks>
    /// <param name="text">The text to tokenize. May be null or empty.</param>
    /// <returns>
    /// The tokens in order of appearance; an empty sequence when <paramref name="text"/>
    /// is null or contains no letters.
    /// </returns>
    public static IEnumerable<string> Tokenize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            yield break;
        }

        // Tokens are contiguous runs of the input, so they are sliced straight out of
        // `text` instead of being copied into a fixed-size scratch buffer. This removes
        // the upper bound on token length and leaves nothing to release if the caller
        // stops enumerating early.
        int tokenStart = -1; // -1 means "not currently inside a token"

        for (int i = 0; i < text.Length; i++)
        {
            char current = text[i];

            if (tokenStart < 0)
            {
                // Only a letter may open a token.
                if (Char.IsLetter(current))
                {
                    tokenStart = i;
                }
            }
            else if (!Char.IsLetterOrDigit(current))
            {
                // The first non-alphanumeric character closes the current token.
                yield return text.Substring(tokenStart, i - tokenStart).ToLowerInvariant();
                tokenStart = -1;
            }
        }

        // Flush a token that runs to the end of the input.
        if (tokenStart >= 0)
        {
            yield return text.Substring(tokenStart).ToLowerInvariant();
        }
    }
}
}

Powered by TurnKey Linux.