You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

373 lines
23 KiB

6 years ago
* Port of Snowball stemmers on C#
* Original stemmers can be found on http://snowball.tartarus.org
* Licence still BSD: http://snowball.tartarus.org/license.php
6 years ago
* Most of stemmers are ported from Java by Iveonik Systems ltd. (www.iveonik.com)
6 years ago
* German stemmer's port found on SourceForge site
6 years ago
using System;
using System.Threading;
using ZeroLevel.Services.Semantic;
namespace Iveonik.Stemmers
public class GermanStemmer : ILexer
6 years ago
private char[] word_buffer;
private int STR_SIZE, R1, R2;
private int BUFFER_SIZE;
private const int INC = 20; // I found out this is optimal word string size
public GermanStemmer()
// Resets the stemmer before a new word is processed: clears the R1/R2 region
// markers and the logical word length, and allocates a fresh working buffer
// of INC characters.
private void SetInitState()
{
    R1 = R2 = -1;
    STR_SIZE = 0;
    // The capacity has to be set before the allocation; BUFFER_SIZE has no
    // initializer, so on a new instance it would otherwise still be 0 here.
    BUFFER_SIZE = INC;
    word_buffer = new char[BUFFER_SIZE];
}
// Grows the working buffer by INC characters while keeping its current contents.
private void Increment()
{
    // The capacity must be raised before reallocating; re-creating the array
    // at the old BUFFER_SIZE would only copy the buffer without adding room.
    BUFFER_SIZE += INC;
    // Array.Resize allocates the larger array and copies the existing characters.
    Array.Resize(ref word_buffer, BUFFER_SIZE);
}
6 years ago
// is char vowel
* The following letters are vowels:
* a e i o u y ä ö ü
6 years ago
6 years ago
// Returns true when ch is one of the German vowels a e i o u y ä ö ü.
// Only lower-case letters are matched, so the 'U' and 'Y' markers written by
// preprocess() are deliberately treated as non-vowels.
private bool vowel(char ch)
{
    switch (ch)
    {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
        case 'y':
        // Umlauts are written as escapes so the literals survive any
        // re-encoding of the source file.
        case '\u00E4': // ä
        case '\u00F6': // ö
        case '\u00FC': // ü
            return true;
        default:
            return false;
    }
}
// R1 and R2 set up in the standard way
// Locates a region start in the standard way: scanning from `start` (0 is
// treated as 1), finds the first non-vowel that directly follows a vowel and
// returns the index just after it. Returns -1 when `start` is negative, lies
// at or beyond the end of the word, or no such position exists.
private int DefineR(int start)
{
    int from = (start == 0) ? 1 : start;
    if (from <= 0 || from >= STR_SIZE)
    {
        return -1;
    }

    for (int pos = from; pos < STR_SIZE; pos++)
    {
        bool isConsonant = !vowel(word_buffer[pos]);
        if (isConsonant && vowel(word_buffer[pos - 1]))
        {
            // The region begins right after the consonant.
            return pos + 1;
        }
    }

    return -1;
}
// Loads strWord into the working buffer and sets STR_SIZE to its length,
// after expanding every 'ß' to "ss".
private void SetWord(string strWord)
{
    // First, replace ß by ss (written as an escape so the literal survives
    // any re-encoding of the source file)
    string tmp = strWord.Replace("\u00DF", "ss");
    // adjust buffer size: round the capacity up to the next multiple of INC.
    // The old contents are overwritten below, so nothing needs to be copied.
    if (tmp.Length > BUFFER_SIZE)
    {
        BUFFER_SIZE = ((tmp.Length + INC - 1) / INC) * INC;
        word_buffer = new char[BUFFER_SIZE];
    }
    STR_SIZE = tmp.Length;
    // fill in to buffer
    tmp.CopyTo(0, word_buffer, 0, STR_SIZE);
}
// Shortens the current word by `count` characters by reducing its logical length.
private void CutEnd(int count) => STR_SIZE -= count;
6 years ago
// Returns true when the current word (the first STR_SIZE characters of
// word_buffer) ends with `end` AND is strictly longer than it. The strict
// length requirement is kept on purpose: callers index the character that
// precedes the suffix after a successful match.
private bool EndsWith(string end)
{
    int offset = STR_SIZE - end.Length;
    if (offset <= 0)
    {
        return false;
    }

    // Compare the suffix character by character; every position of `end`
    // must be checked against its own counterpart in the buffer.
    for (int k = 0; k < end.Length; ++k)
    {
        if (word_buffer[offset + k] != end[k])
        {
            return false;
        }
    }
    return true;
}
// Prepares the loaded word for stemming: marks u/y that sit between two
// vowels by upper-casing them, then computes the R1 and R2 regions.
private void preprocess()
{
    // put u and y between vowels into upper case
    // The last character has no right-hand neighbour, so the scan stops one
    // position earlier; reading word_buffer[i + 1] there would go past the
    // word and, when the word fills the buffer exactly, past the array.
    for (int i = 1; i < STR_SIZE - 1; ++i)
    {
        char current = word_buffer[i];
        if (current != 'u' && current != 'y')
        {
            continue;
        }
        if (vowel(word_buffer[i - 1]) && vowel(word_buffer[i + 1]))
        {
            word_buffer[i] = (current == 'u') ? 'U' : 'Y';
        }
    }

    /*
     * R1 and R2 are first set up in the standard way,
     * but then R1 is adjusted
     * so that the region before it contains at least 3 letters
     */
    R1 = DefineR(0);
    R2 = DefineR(R1);
    if ((R1 < 3) && (R1 > -1))
    {
        R1 = 3;
    }
}
* Search for the longest among the following suffixes,
6 years ago
* (a) e em en ern er es
* (b) s (preceded by a valid s-ending)
6 years ago
* and delete if in R1.
* (Of course the letter of the valid s-ending is not necessarily in R1)
* (For example, äckern -> äck, ackers -> acker, armes -> arm)
6 years ago
6 years ago
// Step 1: removes the longest matching suffix among
//   (a) e em en ern er es
//   (b) s, when preceded by a valid s-ending (b d f g h k l m n r t)
// provided the suffix lies inside R1. Only the first matching branch is tried.
private void Step1()
{
    if (R1 < 0)
    {
        // No R1 region was found, so nothing may be removed.
        return;
    }

    // e em en ern er es
    if (EndsWith("ern"))
    {
        if ((STR_SIZE - R1) >= 3)
        {
            CutEnd(3);
        }
    }
    else if (EndsWith("em") || EndsWith("en") || EndsWith("er") || EndsWith("es"))
    {
        if ((STR_SIZE - R1) >= 2)
        {
            CutEnd(2);
        }
    }
    else if (EndsWith("e"))
    {
        if ((STR_SIZE - R1) >= 1)
        {
            CutEnd(1);
        }
    }
    // b, d, f, g, h, k, l, m, n, r or t
    else if (EndsWith("bs") || EndsWith("ds") || EndsWith("fs") ||
             EndsWith("gs") || EndsWith("hs") || EndsWith("ks") ||
             EndsWith("ls") || EndsWith("ms") || EndsWith("ns") ||
             EndsWith("rs") || EndsWith("ts"))
    {
        // Only the final 's' has to be in R1; the s-ending letter stays.
        if ((STR_SIZE - R1) >= 1)
        {
            CutEnd(1);
        }
    }
}
* Search for the longest among the following suffixes,
6 years ago
* (a) en er est
* (b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
* and delete if in R1.
6 years ago
* (For example, derbsten -> derbst by step 1, and derbst -> derb by step 2, since b is a valid st-ending, and is preceded by just 3 letters)
6 years ago
// Step 2: removes the longest matching suffix among
//   (a) en er est
//   (b) st, when preceded by a valid st-ending (b d f g h k l m n t) that is
//       itself preceded by at least 3 letters
// provided the suffix lies inside R1. Only the first matching branch is tried.
private void Step2()
{
    if (R1 < 0)
    {
        // No R1 region was found, so nothing may be removed.
        return;
    }

    // en er est
    if (EndsWith("est"))
    {
        if ((STR_SIZE - R1) >= 3)
        {
            CutEnd(3);
        }
    }
    else if (EndsWith("en") || EndsWith("er"))
    {
        if ((STR_SIZE - R1) >= 2)
        {
            CutEnd(2);
        }
    }
    // b, d, f, g, h, k, l, m, n or t
    else if (EndsWith("bst") || EndsWith("dst") || EndsWith("fst") ||
             EndsWith("gst") || EndsWith("hst") || EndsWith("kst") ||
             EndsWith("lst") || EndsWith("mst") || EndsWith("nst") ||
             EndsWith("tst"))
    {
        // preceded by at least 3 letters: 3 letters + st-ending + "st" = 6
        if (STR_SIZE > 5)
        {
            // Only "st" is removed; the st-ending letter stays.
            if ((STR_SIZE - R1) >= 2)
            {
                CutEnd(2);
            }
        }
    }
}
private void Step3()
if ((R2 < 0) || (R1 < 0))
* Search for the longest among the following suffixes,
* and perform the action indicated.
* end ung
* delete if in R2
* if preceded by ig, delete if in R2 and not preceded by e
6 years ago
if (EndsWith("end") || EndsWith("ung"))
if ((STR_SIZE - R2) >= 3)
if (EndsWith("ig") && (word_buffer[(STR_SIZE - 3)] != 'e'))
if ((STR_SIZE - R2) >= 2)
* ig ik isch
* delete if in R2 and not preceded by e
6 years ago
else if ((EndsWith("ig") || EndsWith("ik")) && (word_buffer[(STR_SIZE - 3)] != 'e'))
if ((STR_SIZE - R2) >= 2)
else if (EndsWith("isch") && (word_buffer[(STR_SIZE - 5)] != 'e'))
if ((STR_SIZE - R2) >= 4)
* lich heit
* delete if in R2
* if preceded by er or en, delete if in R1
6 years ago
else if (EndsWith("lich") || EndsWith("heit"))
// if preceded by er or en, delete if in R1
if (EndsWith("en") || EndsWith("er"))
if ((STR_SIZE - R1) >= 2)
STR_SIZE += 4;
STR_SIZE += 4;
if ((STR_SIZE - R2) >= 4)
* keit
* delete if in R2
* if preceded by lich or ig, delete if in R2
6 years ago
else if (EndsWith("keit"))
if ((STR_SIZE - R2) >= 4)
// if preceded by lich or ig, delete if in R2
6 years ago
if (EndsWith("ig"))
if ((STR_SIZE - R2) >= 2)
else if (EndsWith("lich"))
if ((STR_SIZE - R2) >= 4)
// Turn U and Y back into lower case,
// and remove the umlaut accent from a, o and u.
// Final clean-up of the stemmed word: turns the 'U' and 'Y' markers back into
// lower case and removes the umlaut accent from ä, ö and ü.
private void Finally()
{
    for (int i = 0; i < STR_SIZE; ++i)
    {
        // Umlauts are written as escapes so the literals survive any
        // re-encoding of the source file.
        switch (word_buffer[i])
        {
            case '\u00E4': // ä
                word_buffer[i] = 'a'; break;
            case 'U':
                word_buffer[i] = 'u'; break;
            case '\u00FC': // ü
                word_buffer[i] = 'u'; break;
            case 'Y':
                word_buffer[i] = 'y'; break;
            case '\u00F6': // ö
                word_buffer[i] = 'o'; break;
            default: break;
        }
    }
}
private string Stem()
// return stemed word
return new string(word_buffer, 0, STR_SIZE);
private int counter = 0;
6 years ago
// Stems a single word: lower-cases it (invariant culture), assigns it to Word
// and returns the result of Stem().
//
// The instance keeps its working state in fields, so only one call may be in
// progress at a time; an overlapping call is rejected with an exception.
//
// Throws ArgumentNullException when word is null.
// Throws InvalidOperationException when another call is already running.
public string Lex(string word)
{
    if (word == null)
    {
        throw new ArgumentNullException(nameof(word));
    }

    // Claim the guard atomically: a separate "check, then increment" would let
    // two threads pass the check together.
    if (Interlocked.CompareExchange(ref counter, 1, 0) != 0)
    {
        throw new InvalidOperationException("German stemmer is not reenterable. Create new instance");
    }

    try
    {
        Word = word.ToLowerInvariant();
        return Stem();
    }
    finally
    {
        // Must run on both the return and the exception path; a release placed
        // after the return statement is never reached and would leave the
        // instance locked after its first use.
        Interlocked.Exchange(ref counter, 0);
    }
}
private string Word
return new string(word_buffer, 0, STR_SIZE);

Powered by TurnKey Linux.