using System;
using System.Collections.Generic;

namespace Lemmatization
{
    /// <summary>
    /// Forward/backward character cursor over a string, plus static helpers that
    /// split text into sentences and tokens.
    /// </summary>
    public class SpecTextReader
    {
        private int _position;
        private readonly string _template;

        /// <summary>True when the cursor is past the last character (always true for empty text).</summary>
        public bool EOF => _position >= _template.Length;

        /// <summary>True when the cursor is on index 0.</summary>
        public bool StartPosition => _position == 0;

        /// <summary>True when the cursor is on the final character of the text.</summary>
        public bool LastPosition => _position == _template.Length - 1;

        /// <summary>Character under the cursor, or <see cref="char.MinValue"/> at EOF.</summary>
        public char Current => EOF ? char.MinValue : _template[_position];

        /// <summary>Character right after the cursor, or <see cref="char.MinValue"/> when there is none.</summary>
        public char Next => EOF || LastPosition ? char.MinValue : _template[_position + 1];

        /// <summary>Character right before the cursor, or <see cref="char.MinValue"/> at the start.</summary>
        public char Preview => StartPosition ? char.MinValue : _template[_position - 1];

        /// <summary>Creates a reader positioned at the first character.</summary>
        /// <param name="template">Text to read; <c>null</c> is treated as an empty string.</param>
        public SpecTextReader(string template)
        {
            // A null template previously made EOF evaluate to false (lifted comparison
            // with null) and every accessor throw; normalise it to empty text instead.
            _template = template ?? string.Empty;
            _position = 0;
        }

        /// <summary>
        /// Moves the cursor by <paramref name="offset"/> characters, clamping the result
        /// to the range [0, text length].
        /// </summary>
        /// <param name="offset">Number of characters to move; may be negative.</param>
        /// <returns>
        /// <c>true</c> when the cursor points at a character after the move;
        /// <c>false</c> when the reader was already at EOF or the move reached EOF.
        /// </returns>
        public bool Move(int offset = 1)
        {
            if (EOF)
            {
                return false;
            }

            // long arithmetic so that extreme offsets cannot overflow int.
            long target = (long)_position + offset;
            if (target < 0)
            {
                target = 0;
            }
            else if (target > _template.Length)
            {
                target = _template.Length;
            }

            _position = (int)target;
            return !EOF;
        }

        /// <summary>Skips consecutive whitespace characters starting at the cursor.</summary>
        /// <returns>Number of characters skipped.</returns>
        public int SkipSpaces()
        {
            int skipped = 0;
            while (!EOF && char.IsWhiteSpace(Current))
            {
                Move();
                skipped++;
            }

            return skipped;
        }

        /// <summary>
        /// Skips consecutive whitespace characters (which includes line breaks)
        /// starting at the cursor.
        /// </summary>
        public void SkipBreaks()
        {
            SkipSpaces();
        }

        /// <summary>Moves the cursor one character back.</summary>
        /// <returns><c>false</c> when the cursor was already at the start; otherwise <c>true</c>.</returns>
        public bool MoveBack()
        {
            if (_position <= 0)
            {
                _position = 0;
                return false;
            }

            _position--;
            return true;
        }

        /// <summary>
        /// Finds the first occurrence of <paramref name="symbol"/> at or after the cursor.
        /// </summary>
        /// <param name="symbol">Character to look for.</param>
        /// <returns>
        /// Distance from the cursor to the match (0 when the current character matches),
        /// or -1 when the reader is at EOF or the symbol does not occur.
        /// </returns>
        public int FindOffsetTo(char symbol)
        {
            if (EOF)
            {
                return -1;
            }

            // IndexOf stops at the end of the text, so a missing symbol yields -1
            // instead of indexing past the last character.
            int found = _template.IndexOf(symbol, _position);
            return found < 0 ? -1 : found - _position;
        }

        /// <summary>
        /// Checks whether the character at cursor + <paramref name="offset"/> equals <paramref name="sym"/>.
        /// </summary>
        /// <param name="sym">Expected character.</param>
        /// <param name="offset">Offset relative to the cursor; may be negative.</param>
        /// <returns><c>false</c> when the index is outside the text or the character differs.</returns>
        public bool Test(char sym, int offset = 0)
        {
            int index = _position + offset;
            if (index < 0 || index >= _template.Length)
            {
                return false;
            }

            return _template[index] == sym;
        }

        /// <summary>
        /// Returns the identifier starting at the cursor: a letter followed by letters,
        /// digits, '_' or '-'. The cursor is not advanced.
        /// </summary>
        /// <returns>The identifier in lower case (invariant culture), or an empty string when none starts here.</returns>
        public string ReadIdentity()
        {
            int start = _position;
            if (start >= _template.Length || !char.IsLetter(_template[start]))
            {
                return string.Empty;
            }

            int end = start + 1;
            while (end < _template.Length && IsIdentityPart(_template[end]))
            {
                end++;
            }

            return _template.Substring(start, end - start).ToLowerInvariant();
        }

        /// <summary>
        /// Returns the run of letters and digits starting at the cursor.
        /// The cursor is not advanced.
        /// </summary>
        /// <returns>The word as written, or an empty string when none starts here.</returns>
        public string ReadWord()
        {
            int start = _position;
            if (start >= _template.Length || !char.IsLetterOrDigit(_template[start]))
            {
                return string.Empty;
            }

            int end = start + 1;
            while (end < _template.Length && char.IsLetterOrDigit(_template[end]))
            {
                end++;
            }

            return _template.Substring(start, end - start);
        }

        /// <summary>
        /// Splits a line into tokens. Runs of letters become Word, runs of digits become
        /// Number, mixed letter/digit runs become Identity, and every other
        /// non-whitespace character becomes a one-character Punctuation token.
        /// </summary>
        /// <param name="line">Text to tokenize; <c>null</c> or empty yields an empty array.</param>
        /// <returns>Tokens in order of appearance.</returns>
        public static Token[] ParseToTokens(string line)
        {
            var tokens = new List<Token>();
            if (string.IsNullOrEmpty(line))
            {
                return tokens.ToArray();
            }

            int runStart = -1; // index where the current letter/digit run began; -1 when no run is open
            TokenType runType = TokenType.Unknown;

            for (int i = 0; i < line.Length; i++)
            {
                char ch = line[i];

                if (char.IsLetter(ch))
                {
                    if (runStart < 0)
                    {
                        runStart = i;
                        runType = TokenType.Word;
                    }
                    else if (runType == TokenType.Number)
                    {
                        runType = TokenType.Identity;
                    }
                }
                else if (char.IsDigit(ch))
                {
                    if (runStart < 0)
                    {
                        runStart = i;
                        runType = TokenType.Number;
                    }
                    else if (runType == TokenType.Word)
                    {
                        runType = TokenType.Identity;
                    }
                }
                else
                {
                    if (runStart >= 0)
                    {
                        tokens.Add(new Token { Type = runType, Value = line.Substring(runStart, i - runStart) });

                        // Start the next run from scratch; otherwise its type would be
                        // derived from the token that has just been emitted.
                        runStart = -1;
                        runType = TokenType.Unknown;
                    }

                    if (!char.IsWhiteSpace(ch))
                    {
                        tokens.Add(new Token { Type = TokenType.Punctuation, Value = ch.ToString() });
                    }
                }
            }

            if (runStart >= 0)
            {
                tokens.Add(new Token { Type = runType, Value = line.Substring(runStart) });
            }

            return tokens.ToArray();
        }

        /// <summary>
        /// Lazily splits text into sentences on '.' and tokenizes each one with
        /// <see cref="ParseToTokens"/>. Empty segments (consecutive periods) are skipped.
        /// Text after the final period is returned as a sentence too, unless it is
        /// only whitespace.
        /// </summary>
        /// <param name="text">Text to split; <c>null</c> or empty yields no sentences.</param>
        /// <returns>Sentences in order of appearance.</returns>
        public static IEnumerable<Sentence> ReadSentenses(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                yield break;
            }

            int segmentStart = 0;
            for (int i = 0; i < text.Length; i++)
            {
                if (text[i] != '.')
                {
                    continue;
                }

                if (i > segmentStart)
                {
                    yield return new Sentence { Tokens = ParseToTokens(text.Substring(segmentStart, i - segmentStart)) };
                }

                segmentStart = i + 1;
            }

            // The last sentence may lack a terminating period; do not drop it.
            if (segmentStart < text.Length)
            {
                string tail = text.Substring(segmentStart);
                if (!string.IsNullOrWhiteSpace(tail))
                {
                    yield return new Sentence { Tokens = ParseToTokens(tail) };
                }
            }
        }

        /// <summary>True for characters allowed after the first letter of an identifier.</summary>
        private static bool IsIdentityPart(char ch)
        {
            return char.IsLetterOrDigit(ch) || ch == '_' || ch == '-';
        }
    }
}