You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Zero/ZeroLevel/Services/Semantic/WordTokenizer.cs

48 lines
1.4 KiB

5 years ago
using System;
using System.Collections.Generic;
using ZeroLevel.Services.Pools;
namespace ZeroLevel.Services.Semantic
{
/// <summary>
/// Splits text into lower-cased word tokens.
/// </summary>
public static class WordTokenizer
{
    /// <summary>
    /// Lazily enumerates the word tokens contained in <paramref name="text"/>.
    /// </summary>
    /// <remarks>
    /// A token starts at a character for which <see cref="char.IsLetter(char)"/> is true and
    /// continues while <see cref="char.IsLetterOrDigit(char)"/> is true. Any other character
    /// ends the current token and is skipped. Consequently digits that precede the first
    /// letter of a word are dropped ("42x" produces "x"), while digits after a letter are kept
    /// ("abc123" produces "abc123").
    /// </remarks>
    /// <param name="text">Text to tokenize. May be null or empty.</param>
    /// <returns>
    /// The tokens in order of appearance, each converted with
    /// <see cref="string.ToLowerInvariant()"/>. The sequence is empty when
    /// <paramref name="text"/> is null, empty, or contains no letters.
    /// </returns>
    public static IEnumerable<string> Tokenize(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            yield break;
        }

        // Index of the first character of the token being scanned, or -1 between tokens.
        int tokenStart = -1;

        for (int i = 0; i < text.Length; i++)
        {
            char current = text[i];

            if (tokenStart < 0)
            {
                // Between tokens: only a letter may open a new token.
                if (char.IsLetter(current))
                {
                    tokenStart = i;
                }
            }
            else if (!char.IsLetterOrDigit(current))
            {
                // Tokens are sliced straight out of the input, so their length is not
                // limited by any fixed-size scratch buffer.
                yield return text.Substring(tokenStart, i - tokenStart).ToLowerInvariant();
                tokenStart = -1;
            }
        }

        // Flush a token that runs to the end of the input.
        if (tokenStart >= 0)
        {
            yield return text.Substring(tokenStart).ToLowerInvariant();
        }
    }
}
}

Powered by TurnKey Linux.