|
|
/*
|
|
|
* Port of the Snowball stemmers to C#
|
|
|
* Original stemmers can be found on http://snowball.tartarus.org
|
|
|
* Licence still BSD: http://snowball.tartarus.org/license.php
|
|
|
*
|
|
|
* Most of the stemmers were ported from Java by Iveonik Systems ltd. (www.iveonik.com)
|
|
|
*
|
|
|
* The German stemmer port was found on SourceForge
|
|
|
*/
|
|
|
|
|
|
using System;
|
|
|
using System.Threading;
|
|
|
using ZeroLevel.Services.Semantic;
|
|
|
|
|
|
namespace Iveonik.Stemmers
|
|
|
{
|
|
|
/// <summary>
/// Snowball stemmer for German.
/// </summary>
/// <remarks>
/// The word being processed is kept in private fields, so one instance can
/// only stem one word at a time. <see cref="Lex"/> detects overlapping calls
/// on the same instance and throws; create one instance per thread.
/// </remarks>
public class GermanStemmer : ILexer
{
    // Letters that may precede a removable "s" ending (step 1).
    private const string ValidSEndings = "bdfghklmnrt";

    // Letters that may precede a removable "st" ending (step 2).
    private const string ValidStEndings = "bdfghklmnt";

    // Characters of the current word; only the first _length are meaningful.
    private char[] _buffer;

    // Current logical length of the word inside _buffer.
    private int _length;

    // Start indexes of regions R1 and R2, or -1 when the region does not exist.
    private int _r1;
    private int _r2;

    // 1 while a Lex call is running on this instance, 0 otherwise.
    private int _busy;

    /// <summary>
    /// Creates a stemmer with an empty working buffer.
    /// </summary>
    public GermanStemmer()
    {
        _buffer = new char[0];
        _length = 0;
        _r1 = -1;
        _r2 = -1;
    }

    /// <summary>
    /// Returns the stem of a German word.
    /// </summary>
    /// <param name="word">Word to stem; it is lower-cased before processing.</param>
    /// <returns>The stemmed word, with umlauts removed from a, o and u.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// Another call to this method is still running on the same instance.
    /// </exception>
    public string Lex(string word)
    {
        if (word == null)
        {
            throw new ArgumentNullException("word");
        }

        // Claim the instance atomically: a separate "check, then increment"
        // would let two threads pass the check at the same time.
        if (Interlocked.CompareExchange(ref _busy, 1, 0) != 0)
        {
            throw new InvalidOperationException("German stemmer is not reenterable. Create new instance");
        }

        try
        {
            SetWord(word.ToLowerInvariant());
            return Stem();
        }
        finally
        {
            Interlocked.Exchange(ref _busy, 0);
        }
    }

    // Loads a word into the working buffer, replacing every 'ß' by "ss".
    private void SetWord(string strWord)
    {
        // The algorithm only ever shortens the word, so an exactly sized
        // buffer is sufficient.
        _buffer = strWord.Replace("ß", "ss").ToCharArray();
        _length = _buffer.Length;
        _r1 = -1;
        _r2 = -1;
    }

    // Runs all stages on the loaded word and returns the resulting stem.
    private string Stem()
    {
        Preprocess();
        Step1();
        Step2();
        Step3();
        Postprocess();
        return new string(_buffer, 0, _length);
    }

    // The following letters are vowels: a e i o u y ä ö ü
    private static bool IsVowel(char ch)
    {
        switch (ch)
        {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case 'ä':
            case 'ö':
            case 'ü':
                return true;
            default:
                return false;
        }
    }

    // Returns the index just after the first non-vowel that follows a vowel,
    // searching from 'start' (0 is treated as 1), or -1 when there is none.
    private int FindRegionStart(int start)
    {
        if (start == 0)
        {
            start = 1;
        }

        if (start < 1)
        {
            return -1;
        }

        for (int i = start; i < _length; ++i)
        {
            if (!IsVowel(_buffer[i]) && IsVowel(_buffer[i - 1]))
            {
                return i + 1;
            }
        }

        return -1;
    }

    // True when the word is strictly longer than 'suffix' and ends with it.
    private bool EndsWith(string suffix)
    {
        if (_length <= suffix.Length)
        {
            return false;
        }

        int offset = _length - suffix.Length;
        for (int i = 0; i < suffix.Length; ++i)
        {
            if (_buffer[offset + i] != suffix[i])
            {
                return false;
            }
        }

        return true;
    }

    // True when the word ends with 'suffix', the letter right before the
    // suffix is one of 'precedingLetters', and at least one more letter
    // comes before that one.
    private bool EndsWithPrecededBy(string suffix, string precedingLetters)
    {
        int precedingIndex = _length - suffix.Length - 1;
        return precedingIndex > 0
            && EndsWith(suffix)
            && precedingLetters.IndexOf(_buffer[precedingIndex]) >= 0;
    }

    // Drops the last 'suffixLength' characters when they lie entirely inside
    // the region starting at 'regionStart'. Returns true when they were dropped.
    private bool RemoveIfInRegion(int suffixLength, int regionStart)
    {
        if (_length - suffixLength < regionStart)
        {
            return false;
        }

        _length -= suffixLength;
        return true;
    }

    // Marks u and y between vowels as upper case and computes R1 and R2.
    private void Preprocess()
    {
        // The last character cannot sit between two vowels, so the loop stops
        // one short of the end; this also keeps the i + 1 read inside the word.
        for (int i = 1; i < _length - 1; ++i)
        {
            char current = _buffer[i];
            if ((current == 'u' || current == 'y')
                && IsVowel(_buffer[i - 1])
                && IsVowel(_buffer[i + 1]))
            {
                _buffer[i] = (current == 'u') ? 'U' : 'Y';
            }
        }

        /*
         * R1 and R2 are first set up in the standard way,
         * but then R1 is adjusted
         * so that the region before it contains at least 3 letters
         */
        _r1 = FindRegionStart(0);
        _r2 = FindRegionStart(_r1);
        if (_r1 > -1 && _r1 < 3)
        {
            _r1 = 3;
        }
    }

    /*
     * Search for the longest among the following suffixes,
     * (a) e em en ern er es
     * (b) s (preceded by a valid s-ending)
     * and delete if in R1.
     * (Of course the letter of the valid s-ending is not necessarily in R1)
     *
     * (For example, äckern -> äck, ackers -> acker, armes -> arm)
     */
    private void Step1()
    {
        if (_r1 < 0)
        {
            return;
        }

        if (EndsWith("ern"))
        {
            RemoveIfInRegion(3, _r1);
        }
        else if (EndsWith("em") || EndsWith("en") || EndsWith("er") || EndsWith("es"))
        {
            RemoveIfInRegion(2, _r1);
        }
        else if (EndsWith("e"))
        {
            RemoveIfInRegion(1, _r1);
        }
        else if (EndsWithPrecededBy("s", ValidSEndings))
        {
            RemoveIfInRegion(1, _r1);
        }
    }

    /*
     * Search for the longest among the following suffixes,
     * (a) en er est
     * (b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
     * and delete if in R1.
     *
     * (For example, derbsten -> derbst by step 1, and derbst -> derb by step 2,
     * since b is a valid st-ending, and is preceded by just 3 letters)
     */
    private void Step2()
    {
        if (_r1 < 0)
        {
            return;
        }

        if (EndsWith("est"))
        {
            RemoveIfInRegion(3, _r1);
        }
        else if (EndsWith("en") || EndsWith("er"))
        {
            RemoveIfInRegion(2, _r1);
        }
        else if (EndsWithPrecededBy("st", ValidStEndings))
        {
            // 3 letters + the st-ending letter + "st" means at least 6 characters.
            if (_length > 5)
            {
                RemoveIfInRegion(2, _r1);
            }
        }
    }

    /*
     * Search for the longest among the following suffixes,
     * and perform the action indicated.
     */
    private void Step3()
    {
        if (_r2 < 0 || _r1 < 0)
        {
            return;
        }

        /*
         * end ung
         *   delete if in R2
         *   if preceded by ig, delete if in R2 and not preceded by e
         */
        if (EndsWith("end") || EndsWith("ung"))
        {
            RemoveIfInRegion(3, _r2);
            if (EndsWith("ig") && _buffer[_length - 3] != 'e')
            {
                RemoveIfInRegion(2, _r2);
            }
        }
        /*
         * ig ik isch
         *   delete if in R2 and not preceded by e
         */
        else if ((EndsWith("ig") || EndsWith("ik")) && _buffer[_length - 3] != 'e')
        {
            RemoveIfInRegion(2, _r2);
        }
        else if (EndsWith("isch") && _buffer[_length - 5] != 'e')
        {
            RemoveIfInRegion(4, _r2);
        }
        /*
         * lich heit
         *   delete if in R2
         *   if preceded by er or en, delete if in R1
         */
        else if (EndsWith("lich") || EndsWith("heit"))
        {
            // The suffix itself goes whenever it is in R2; a preceding er/en
            // that is outside R1 must not prevent that.
            if (RemoveIfInRegion(4, _r2) && (EndsWith("en") || EndsWith("er")))
            {
                RemoveIfInRegion(2, _r1);
            }
        }
        /*
         * keit
         *   delete if in R2
         *   if preceded by lich or ig, delete if in R2
         */
        else if (EndsWith("keit"))
        {
            RemoveIfInRegion(4, _r2);
            if (EndsWith("ig"))
            {
                RemoveIfInRegion(2, _r2);
            }
            else if (EndsWith("lich"))
            {
                RemoveIfInRegion(4, _r2);
            }
        }
    }

    // Turn U and Y back into lower case,
    // and remove the umlaut accent from a, o and u.
    private void Postprocess()
    {
        for (int i = 0; i < _length; ++i)
        {
            switch (_buffer[i])
            {
                case 'ä':
                    _buffer[i] = 'a';
                    break;
                case 'ö':
                    _buffer[i] = 'o';
                    break;
                case 'ü':
                case 'U':
                    _buffer[i] = 'u';
                    break;
                case 'Y':
                    _buffer[i] = 'y';
                    break;
                default:
                    break;
            }
        }
    }
}
|
|
|
} |