Append semantic service

pull/1/head
a.bozhenov 5 years ago
parent 6c3bc6a0b9
commit 13cb6c6b0e

.gitignore (vendored): 4 lines changed

@@ -27,6 +27,8 @@ bower_components
*.sbr
*.scc
*.dbmdl
*.nupkg
*.p7s
*.sln.ide
[Bb]in
[Dd]ebug*/
@@ -41,6 +43,8 @@ TestsResult.html
# NuGet
/packages/
**/packages/*
!**/packages/build/
# Build
/artifacts/

@@ -0,0 +1,381 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.Serialization;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class ExampleList : ISerializable
{
#region Private Variables
private LemmatizerSettings lsett;
private RuleList rlRules;
private Dictionary<string, LemmaExample> dictExamples;
private List<LemmaExample> lstExamples;
#endregion
#region Constructor(s)
public ExampleList(LemmatizerSettings lsett) : base()
{
this.lsett = lsett;
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
}
public ExampleList(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Public Properties & Indexers
public LemmaExample this[int i]
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples[i];
}
}
public int Count
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples.Count;
}
}
public double WeightSum
{
get
{
if (lstExamples == null)
FinalizeAdditions();
double dWeight = 0;
foreach (LemmaExample exm in lstExamples)
dWeight += exm.Weight;
return dWeight;
}
}
public RuleList Rules
{
get
{
return rlRules;
}
}
public List<LemmaExample> ListExamples
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples;
}
}
#endregion
#region Essential Class Functions (adding/removing examples)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
//read from file
string sLine = null;
int iError = 0;
int iLine = 0;
var iW = sFormat.IndexOf('W');
var iL = sFormat.IndexOf('L');
var iM = sFormat.IndexOf('M');
var iF = sFormat.IndexOf('F');
var iLen = Math.Max(Math.Max(iW, iL), Math.Max(iM, iF)) + 1;
if (iW < 0 || iL < 0)
{
throw new Exception("Cannot find the word and lemma locations in the format specification");
}
while ((sLine = srIn.ReadLine()) != null && iError < 50)
{
iLine++;
string[] asWords = sLine.Split(new char[] { '\t' });
if (asWords.Length < iLen)
{
//Console.WriteLine("ERROR: Line doesn't conform to the given format \"" + sFormat + "\"! Line " + iLine.ToString() + ".");
iError++;
continue;
}
var sWord = asWords[iW];
var sLemma = asWords[iL];
if (sLemma.Equals("=", StringComparison.Ordinal))
sLemma = sWord;
string sMsd = null;
if (iM > -1)
sMsd = asWords[iM];
double dWeight = 1;
if (iF > -1)
Double.TryParse(asWords[iF], out dWeight);
AddExample(sWord, sLemma, dWeight, sMsd);
}
if (iError == 50)
throw new Exception("Parsing stopped because of too many (50) errors. Check format specification");
}
public LemmaExample AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
string sNewMsd = lsett.eMsdConsider != LemmatizerSettings.MsdConsideration.Ignore
? sMsd
: null;
var leNew = new LemmaExample(sWord, sLemma, dWeight, sNewMsd, rlRules, lsett);
return Add(leNew);
}
private LemmaExample Add(LemmaExample leNew)
{
LemmaExample leReturn = null;
if (!dictExamples.TryGetValue(leNew.Signature, out leReturn))
{
leReturn = leNew;
dictExamples.Add(leReturn.Signature, leReturn);
}
else
leReturn.Join(leNew);
lstExamples = null;
return leReturn;
}
public void DropExamples()
{
dictExamples.Clear();
lstExamples = null;
}
public void FinalizeAdditions()
{
if (lstExamples != null)
return;
lstExamples = new List<LemmaExample>(dictExamples.Values);
lstExamples.Sort();
}
public ExampleList GetFrontRearExampleList(bool front)
{
var elExamplesNew = new ExampleList(lsett);
foreach (var le in this.ListExamples)
{
if (front)
elExamplesNew.AddExample(le.WordFront, le.LemmaFront, le.Weight, le.Msd);
else
elExamplesNew.AddExample(le.WordRear, le.LemmaRear, le.Weight, le.Msd);
}
elExamplesNew.FinalizeAdditions();
return elExamplesNew;
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
foreach (var exm in lstExamples)
{
sb.AppendLine(exm.ToString());
}
return sb.ToString();
}
#endregion
#region Serialization Functions (.Net Default - ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("iNumExamples", dictExamples.Count);
var aWords = new string[dictExamples.Count];
var aLemmas = new string[dictExamples.Count];
var aWeights = new double[dictExamples.Count];
var aMsds = new string[dictExamples.Count];
int iExm = 0;
foreach (var exm in dictExamples.Values)
{
aWords[iExm] = exm.Word;
aLemmas[iExm] = exm.Lemma;
aWeights[iExm] = exm.Weight;
aMsds[iExm] = exm.Msd;
iExm++;
}
info.AddValue("aWords", aWords);
info.AddValue("aLemmas", aLemmas);
info.AddValue("aWeights", aWeights);
info.AddValue("aMsds", aMsds);
}
public ExampleList(SerializationInfo info, StreamingContext context)
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
var aWords = (string[])info.GetValue("aWords", typeof(string[]));
var aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
var aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
var aMsds = (string[])info.GetValue("aMsds", typeof(string[]));
for (int iExm = 0; iExm < aWords.Length; iExm++)
AddExample(aWords[iExm], aLemmas[iExm], aWeights[iExm], aMsds[iExm]);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
rlRules.Serialize(binWrt, false);
if (!bSerializeExamples)
{
binWrt.Write(false); // lstExamples == null
binWrt.Write(0); // dictExamples.Count == 0
}
else
{
if (lstExamples == null)
{
binWrt.Write(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.Write(iCount);
foreach (var kvp in dictExamples)
{
binWrt.Write(kvp.Value.Rule.Signature);
kvp.Value.Serialize(binWrt, false);
}
}
else
{
binWrt.Write(true); // lstExamples != null
//save list & dictionary items
var iCount = lstExamples.Count;
binWrt.Write(iCount);
foreach (var le in lstExamples)
{
binWrt.Write(le.Rule.Signature);
le.Serialize(binWrt, false);
}
}
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
var bCreateLstExamples = binRead.ReadBoolean();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
var iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var lrRule = rlRules[binRead.ReadString()];
var le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples)
lstExamples.Add(le);
}
}
public ExampleList(BinaryReader binRead, LemmatizerSettings lsett)
{
Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bSerializeExamples, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
rlRules.Save(binWrt, false);
if (!bSerializeExamples) {
binWrt.WriteBool(false); // lstExamples == null
binWrt.WriteInt(0); // dictExamples.Count == 0
}
else {
if (lstExamples == null) {
binWrt.WriteBool(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaExample> kvp in dictExamples) {
binWrt.WriteString(kvp.Value.Rule.Signature);
kvp.Value.Save(binWrt, false);
}
}
else {
binWrt.WriteBool(true); // lstExamples != null
//save list & dictionary items
int iCount = lstExamples.Count;
binWrt.WriteInt(iCount);
foreach (LemmaExample le in lstExamples) {
binWrt.WriteString(le.Rule.Signature);
le.Save(binWrt, false);
}
}
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
bool bCreateLstExamples = binRead.ReadBool();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
LemmaRule lrRule = rlRules[binRead.ReadString()];
LemmaExample le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples) lstExamples.Add(le);
}
}
public ExampleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}
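For orientation, AddMultextFile above reads tab-separated training lines and locates the word, lemma, MSD and frequency columns from a format string such as "WLM" or "WLMF". A minimal standalone sketch of that column lookup is shown below; the format string and the sample line are illustrative only and are not part of the commit.

using System;

class MultextFormatDemo
{
    static void Main()
    {
        // The position of each letter in the format string is the column index of that field.
        const string sFormat = "WLM";
        int iW = sFormat.IndexOf('W'); // word column
        int iL = sFormat.IndexOf('L'); // lemma column
        int iM = sFormat.IndexOf('M'); // MSD tag column (optional)

        // One hypothetical tab-separated training line.
        string sLine = "walking\twalk\tVmpp";
        string[] asWords = sLine.Split('\t');

        string sWord = asWords[iW];
        // "=" in the lemma column means the lemma equals the word.
        string sLemma = asWords[iL].Equals("=", StringComparison.Ordinal) ? sWord : asWords[iL];
        string sMsd = iM > -1 ? asWords[iM] : null;

        Console.WriteLine("word=" + sWord + ", lemma=" + sLemma + ", msd=" + sMsd);
    }
}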

@@ -0,0 +1,481 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
public class LemmaExample : IComparable<LemmaExample>, IComparer<LemmaExample>
{
#region Private Variables
private string sWord;
private string sLemma;
private string sSignature;
private string sMsd;
private double dWeight;
private LemmaRule lrRule;
private LemmatizerSettings lsett;
private string sWordRearCache;
private string sWordFrontCache;
private string sLemmaFrontCache;
#endregion
#region Constructor(s)
public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.sWord = sWord;
this.sLemma = sLemma;
this.sMsd = sMsd;
this.dWeight = dWeight;
this.lrRule = rlRules.AddRule(this);
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
case LemmatizerSettings.MsdConsideration.JoinAll:
case LemmatizerSettings.MsdConsideration.JoinDistinct:
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
sSignature = string.Format("[{0}]==>[{1}]", sWord, sLemma);
break;
case LemmatizerSettings.MsdConsideration.Distinct:
default:
sSignature = string.Format("[{0}]==>[{1}]({2})", sWord, sLemma, sMsd ?? "");
break;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
#endregion
#region Public Properties
public string Word
{
get
{
return sWord;
}
}
public string Lemma
{
get
{
return sLemma;
}
}
public string Msd
{
get
{
return sMsd;
}
}
public string Signature
{
get
{
return sSignature;
}
}
public double Weight
{
get
{
return dWeight;
}
}
public LemmaRule Rule
{
get
{
return lrRule;
}
}
/// <summary>
/// Word to be pre-lemmatized with the Front-Lemmatizer into LemmaFront, which is then lemmatized by the standard Rear-Lemmatizer (warning: it is reversed).
/// </summary>
public string WordFront
{
get
{
if (sWordFrontCache == null)
sWordFrontCache = StringReverse(sWord);
return sWordFrontCache;
}
}
/// <summary>
/// Lemma to be produced by pre-lemmatizing with the Front-Lemmatizer (warning: it is reversed).
/// </summary>
public string LemmaFront
{
get
{
if (sLemmaFrontCache == null)
sLemmaFrontCache = StringReverse(WordRear);
return sLemmaFrontCache;
}
}
/// <summary>
/// Word to be lemmatized by the standard Rear-Lemmatizer (its beginning has already been modified by the Front-Lemmatizer).
/// </summary>
public string WordRear
{
get
{
if (sWordRearCache == null)
{
int lemmaPos = 0, wordPos = 0;
var common = LongestCommonSubstring(sWord, sLemma, ref wordPos, ref lemmaPos);
sWordRearCache = lemmaPos == -1 ? sLemma : (sLemma.Substring(0, lemmaPos + common.Length) + sWord.Substring(wordPos + common.Length));
}
return sWordRearCache;
}
}
/// <summary>
/// Lemma to be produced by the standard Rear-Lemmatizer from WordRear.
/// </summary>
public string LemmaRear
{
get
{
return sLemma;
}
}
#endregion
#region Essential Class Functions (joining two examples into one)
//TODO - this function is not totally OK because sMsd should not be
//changed since it could be included in the signature
public void Join(LemmaExample leJoin)
{
dWeight += leJoin.dWeight;
if (sMsd != null)
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
sMsd = null;
break;
case LemmatizerSettings.MsdConsideration.Distinct:
break;
case LemmatizerSettings.MsdConsideration.JoinAll:
sMsd += "|" + leJoin.sMsd;
break;
case LemmatizerSettings.MsdConsideration.JoinDistinct:
var append = string.Format("|{0}", leJoin.sMsd);
if (false == sMsd.Equals(leJoin.sMsd, StringComparison.Ordinal) &&
sMsd.IndexOf(append) < 0)
{
sMsd += append;
}
break;
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
int iPos = 0;
var iMax = Math.Min(sMsd.Length, leJoin.sMsd.Length);
while (iPos < iMax && sMsd[iPos] == leJoin.sMsd[iPos])
iPos++;
sMsd = sMsd.Substring(0, iPos);
break;
default:
break;
}
}
#endregion
#region Essential Class Functions (calculating similarities between examples)
public int Similarity(LemmaExample le)
{
return Similarity(this, le);
}
public static int Similarity(LemmaExample le1, LemmaExample le2)
{
var sWord1 = le1.sWord;
var sWord2 = le2.sWord;
var iLen1 = sWord1.Length;
var iLen2 = sWord2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sWord1[iLen1 - iPos] != sWord2[iLen2 - iPos])
return iPos - 1;
}
//TODO similarity should be bigger if the two words are totally equal
//if (sWord1 == sWord2)
// return iMaxLen + 1;
//else
return iMaxLen;
}
#endregion
#region Essential Class Functions (comparing examples, e.g. for sorting)
/// <summary>
/// Function used to compare the current MultextExample (ME) against the argument ME.
/// Mainly used for sorting lists of MEs.
/// </summary>
/// <param name="other">MultextExample (ME) that the current ME is compared against.</param>
/// <returns>1 if the current ME is bigger, -1 if smaller and 0 if both are the same.</returns>
public int CompareTo(LemmaExample other)
{
var iComparison = CompareStrings(this.sWord, other.sWord, false);
if (iComparison != 0)
return iComparison;
iComparison = CompareStrings(this.sLemma, other.sLemma, true);
if (iComparison != 0)
return iComparison;
if (lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct &&
this.sMsd != null && other.sMsd != null)
{
iComparison = CompareStrings(this.sMsd, other.sMsd, true);
if (iComparison != 0)
return iComparison;
}
return 0;
}
public int Compare(LemmaExample x, LemmaExample y)
{
return x.CompareTo(y);
}
public static int CompareStrings(string sStr1, string sStr2, bool bForward)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
if (bForward)
{
for (int iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] > sStr2[iPos])
return 1;
if (sStr1[iPos] < sStr2[iPos])
return -1;
}
}
else
{
for (int iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sStr1[iLen1 - iPos] > sStr2[iLen2 - iPos])
return 1;
if (sStr1[iLen1 - iPos] < sStr2[iLen2 - iPos])
return -1;
}
}
if (iLen1 > iLen2)
return 1;
if (iLen1 < iLen2)
return -1;
return 0;
}
public static int EqualPrifixLen(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public static string LongestCommonSubstring(string sStr1, string sStr2, ref int iPosInStr1, ref int iPosInStr2)
{
var l = new int[sStr1.Length + 1, sStr2.Length + 1];
int z = 0;
string ret = "";
iPosInStr1 = -1;
iPosInStr2 = -1;
for (var i = 0; i < sStr1.Length; i++)
{
for (var j = 0; j < sStr2.Length; j++)
{
if (sStr1[i] == sStr2[j])
{
if (i == 0 || j == 0)
{
l[i, j] = 1;
}
else
{
l[i, j] = l[i - 1, j - 1] + 1;
}
if (l[i, j] > z)
{
z = l[i, j];
iPosInStr1 = i - z + 1;
iPosInStr2 = j - z + 1;
ret = sStr1.Substring(i - z + 1, z);
}
}
}
}
return ret;
}
public static string StringReverse(string s)
{
if (s == null)
return null;
var charArray = s.ToCharArray();
//swap characters from both ends towards the middle
var len = s.Length - 1;
for (var i = 0; i < len; i++, len--)
{
charArray[i] = s[len];
charArray[len] = s[i];
}
return new string(charArray);
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
if (sWord != null)
sb.AppendFormat("W:\"{0}\" ", sWord);
if (sLemma != null)
sb.AppendFormat("L:\"{0}\" ", sLemma);
if (sMsd != null)
sb.AppendFormat("M:\"{0}\" ", sMsd);
if (false == Double.IsNaN(dWeight))
sb.AppendFormat("F:\"{0}\" ", dWeight);
if (lrRule != null)
sb.AppendFormat("R:{0} ", lrRule);
if (sb.Length > 0)
return sb.ToString(0, sb.Length - 1);
return string.Empty;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(sWord);
binWrt.Write(sLemma);
binWrt.Write(sSignature);
if (sMsd == null)
{
binWrt.Write(false);
}
else
{
binWrt.Write(true);
binWrt.Write(sMsd);
}
binWrt.Write(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject)
{
lsett.Serialize(binWrt);
lrRule.Serialize(binWrt, false);
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBoolean())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject)
{
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else
{
this.lsett = lsett;
this.lrRule = lrRule;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
Deserialize(binRead, lsett, lrRule);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteString(sWord);
binWrt.WriteString(sLemma);
binWrt.WriteString(sSignature);
if (sMsd == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sMsd);
}
binWrt.WriteDouble(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject) {
lsett.Save(binWrt);
lrRule.Save(binWrt, false);
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBool())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject) {
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else {
this.lsett = lsett;
this.lrRule = lrRule;
}
}
public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
Load(binRead, lsett, lrRule);
}
#endif
#endregion
}
}
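The WordFront/WordRear/LemmaFront properties above split one training pair into a reversed pair for the front lemmatizer and a suffix pair for the rear lemmatizer, based on the longest common substring of word and lemma. Below is a small self-contained sketch of that split with a hypothetical pair; the helper methods merely mirror the class's LongestCommonSubstring and StringReverse and are not the class itself.

using System;

class FrontRearSplitDemo
{
    // Simplified mirror of LemmaExample.LongestCommonSubstring, for illustration only.
    static string Lcs(string a, string b, out int posA, out int posB)
    {
        posA = -1; posB = -1;
        int best = 0; string ret = "";
        var l = new int[a.Length + 1, b.Length + 1];
        for (int i = 0; i < a.Length; i++)
            for (int j = 0; j < b.Length; j++)
                if (a[i] == b[j])
                {
                    l[i, j] = (i == 0 || j == 0) ? 1 : l[i - 1, j - 1] + 1;
                    if (l[i, j] > best)
                    {
                        best = l[i, j];
                        posA = i - best + 1;
                        posB = j - best + 1;
                        ret = a.Substring(posA, best);
                    }
                }
        return ret;
    }

    static string Reverse(string s)
    {
        var c = s.ToCharArray();
        Array.Reverse(c);
        return new string(c);
    }

    static void Main()
    {
        // Hypothetical pair, chosen only because the front of the word differs from its lemma.
        string word = "unhappiest", lemma = "happy";
        string common = Lcs(word, lemma, out int wordPos, out int lemmaPos);

        // WordRear: lemma prefix up to the shared substring + word suffix after it.
        string wordRear = lemma.Substring(0, lemmaPos + common.Length) + word.Substring(wordPos + common.Length);

        Console.WriteLine("WordFront  = " + Reverse(word));      // reversed input for the front lemmatizer
        Console.WriteLine("LemmaFront = " + Reverse(wordRear));  // reversed target for the front lemmatizer
        Console.WriteLine("WordRear   = " + wordRear);           // input for the rear lemmatizer
        Console.WriteLine("LemmaRear  = " + lemma);              // target for the rear lemmatizer
    }
}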

@@ -0,0 +1,189 @@
using System;
using System.IO;
namespace LemmaSharp
{
public class LemmaRule
{
#region Private Variables
private int iId;
private int iFrom;
private string sFrom;
private string sTo;
private string sSignature;
private LemmatizerSettings lsett;
#endregion
#region Constructor(s)
public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.iId = iId;
int iSameStem = SameStem(sWord, sLemma);
sTo = sLemma.Substring(iSameStem);
iFrom = sWord.Length - iSameStem;
if (lsett.bUseFromInRules)
{
sFrom = sWord.Substring(iSameStem);
sSignature = string.Format("[{0}]==>[{1}]", sFrom, sTo);
}
else
{
sFrom = null;
sSignature = string.Format("[#{0}]==>[{1}]", iFrom, sTo);
}
}
#endregion
#region Public Properties
public string Signature
{
get
{
return sSignature;
}
}
public int Id
{
get
{
return iId;
}
}
#endregion
#region Essential Class Functions
private static int SameStem(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public bool IsApplicableToGroup(int iGroupCondLen)
{
return iGroupCondLen >= iFrom;
}
public string Lemmatize(string sWord)
{
return sWord.Substring(0, sWord.Length - iFrom) + sTo;
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
return string.Format("{0}:{1}", iId, sSignature);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(iId);
binWrt.Write(iFrom);
if (sFrom == null)
binWrt.Write(false);
else
{
binWrt.Write(true);
binWrt.Write(sFrom);
}
binWrt.Write(sTo);
binWrt.Write(sSignature);
if (bThisTopObject)
lsett.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
iId = binRead.ReadInt32();
iFrom = binRead.ReadInt32();
if (binRead.ReadBoolean())
{
sFrom = binRead.ReadString();
}
else
{
sFrom = null;
}
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteInt(iId);
binWrt.WriteInt(iFrom);
if (sFrom == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sFrom);
}
binWrt.WriteString(sTo);
binWrt.WriteString(sSignature);
if (bThisTopObject)
lsett.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
iId = binRead.ReadInt();
iFrom = binRead.ReadInt();
if (binRead.ReadBool())
sFrom = binRead.ReadString();
else
sFrom = null;
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}
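A LemmaRule is essentially "strip the last iFrom characters and append sTo", derived from the part of the word that does not share a common prefix with its lemma. The sketch below walks through that derivation and its application for a hypothetical training pair; it mirrors the constructor and Lemmatize above but is not the class itself.

using System;

class SuffixRuleDemo
{
    static void Main()
    {
        // Derive a suffix rule from one hypothetical training pair.
        string sWord = "walking", sLemma = "walk";

        // Length of the common prefix (as in LemmaRule.SameStem).
        int iSameStem = 0;
        int iMaxLen = Math.Min(sWord.Length, sLemma.Length);
        while (iSameStem < iMaxLen && sWord[iSameStem] == sLemma[iSameStem])
            iSameStem++;

        string sFrom = sWord.Substring(iSameStem);  // suffix removed from the word ("ing")
        string sTo = sLemma.Substring(iSameStem);   // suffix appended instead ("")
        int iFrom = sWord.Length - iSameStem;       // number of characters to strip

        Console.WriteLine("rule: [" + sFrom + "]==>[" + sTo + "]");

        // Apply the rule to another word with the same ending, as LemmaRule.Lemmatize does.
        string sOther = "talking";
        Console.WriteLine(sOther.Substring(0, sOther.Length - iFrom) + sTo);
    }
}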

@@ -0,0 +1,478 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class LemmaTreeNode : ILemmatizerModel
{
#region Private Variables
//settings
private LemmatizerSettings lsett;
//tree structure references
private Dictionary<char, LemmaTreeNode> dictSubNodes;
private LemmaTreeNode ltnParentNode;
//essential node properties
private int iSimilarity; //similarity among all words in this node
private string sCondition; //suffix that must match in order to lemmatize
private bool bWholeWord; //true if condition has to match to whole word
//rules and weights;
private LemmaRule lrBestRule; //the best rule to be applied when lemmatizing
private RuleWeighted[] aBestRules; //list of best rules
private double dWeight;
//source of this node
private int iStart;
private int iEnd;
private ExampleList elExamples;
#endregion
#region Constructor(s) & Destructor(s)
private LemmaTreeNode(LemmatizerSettings lsett)
{
this.lsett = lsett;
}
public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
: this(lsett, elExamples, 0, elExamples.Count - 1, null)
{
}
/// <summary>
///
/// </summary>
/// <param name="lsett"></param>
/// <param name="elExamples"></param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode"></param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode) : this(lsett)
{
this.ltnParentNode = ltnParentNode;
this.dictSubNodes = null;
this.iStart = iStart;
this.iEnd = iEnd;
this.elExamples = elExamples;
if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
{
lrBestRule = elExamples.Rules.DefaultRule;
aBestRules = new RuleWeighted[1];
aBestRules[0] = new RuleWeighted(lrBestRule, 0);
dWeight = 0;
return;
}
int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
this.bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;
FindBestRules();
AddSubAll();
//TODO check this heuristic, it can be problematic when there are more applicable rules
if (dictSubNodes != null)
{
var lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
foreach (var kvpChild in dictSubNodes)
if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
{
var enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
enumChildChild.MoveNext();
var ltrChildChild = enumChildChild.Current;
if (kvpChild.Value.lrBestRule == lrBestRule)
lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
}
foreach (var kvpChild in lReplaceNodes)
{
dictSubNodes[kvpChild.Key] = kvpChild.Value;
kvpChild.Value.ltnParentNode = this;
}
}
}
#endregion
#region Public Properties
public int TreeSize
{
get
{
int iCount = 1;
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
iCount += ltnChild.TreeSize;
}
}
return iCount;
}
}
public double Weight
{
get
{
return dWeight;
}
}
#endregion
#region Essential Class Functions (building model)
private void FindBestRules()
{
/*
* LINQ SPEED TEST (slower than the current methodology)
*
List<LemmaExample> leApplicable = new List<LemmaExample>();
for (int iExm = iStart; iExm <= iEnd; iExm++)
if (elExamples[iExm].Rule.IsApplicableToGroup(sCondition.Length))
leApplicable.Add(elExamples[iExm]);
List<KeyValuePair<LemmaRule, double>> lBestRules = new List<KeyValuePair<LemmaRule,double>>();
lBestRules.AddRange(
leApplicable.
GroupBy<LemmaExample, LemmaRule, double, KeyValuePair<LemmaRule, double>>(
le => le.Rule,
le => le.Weight,
(lr, enumDbl) => new KeyValuePair<LemmaRule, double>(lr, enumDbl.Aggregate((acc, curr) => acc + curr))
).
OrderBy(kvpLrWght=>kvpLrWght.Value)
);
if (lBestRules.Count > 0)
lrBestRule = lBestRules[0].Key;
else {
lrBestRule = elExamples.Rules.DefaultRule;
}
*/
dWeight = 0;
//calculate dWeight of the whole node and the qualities of all rules
var dictApplicableRules = new Dictionary<LemmaRule, double>();
//dictApplicableRules.Add(elExamples.Rules.DefaultRule, 0);
while (dictApplicableRules.Count == 0)
{
for (var iExm = iStart; iExm <= iEnd; iExm++)
{
var lr = elExamples[iExm].Rule;
var dExmWeight = elExamples[iExm].Weight;
dWeight += dExmWeight;
if (lr.IsApplicableToGroup(sCondition.Length))
{
if (dictApplicableRules.ContainsKey(lr))
dictApplicableRules[lr] += dExmWeight;
else
dictApplicableRules.Add(lr, dExmWeight);
}
}
//if none found then increase the condition length or add some default applicable rule
if (dictApplicableRules.Count == 0)
{
if (this.sCondition.Length < iSimilarity)
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - (sCondition.Length + 1));
else
//TODO check this heuristic, it may be better to add the default rule instead of the parent's rule
dictApplicableRules.Add(ltnParentNode.lrBestRule, 0);
}
}
//TODO can optimize this step using a sorted list (don't add if it is worse than the worst)
var lSortedRules = new List<RuleWeighted>();
foreach (var kvp in dictApplicableRules)
{
lSortedRules.Add(new RuleWeighted(kvp.Key, kvp.Value / dWeight));
}
lSortedRules.Sort();
//keep just best iMaxRulesPerNode rules
var iNumRules = lSortedRules.Count;
if (lsett.iMaxRulesPerNode > 0)
iNumRules = Math.Min(lSortedRules.Count, lsett.iMaxRulesPerNode);
aBestRules = new RuleWeighted[iNumRules];
for (var iRule = 0; iRule < iNumRules; iRule++)
{
aBestRules[iRule] = lSortedRules[iRule];
}
//set best rule
lrBestRule = aBestRules[0].Rule;
//TODO must check whether this heuristic is OK (privileging the parent rule)
if (ltnParentNode != null)
{
for (int iRule = 0; iRule < lSortedRules.Count &&
lSortedRules[iRule].Weight == lSortedRules[0].Weight; iRule++)
{
if (lSortedRules[iRule].Rule == ltnParentNode.lrBestRule)
{
lrBestRule = lSortedRules[iRule].Rule;
break;
}
}
}
}
private void AddSubAll()
{
int iStartGroup = iStart;
var chCharPrev = '\0';
var bSubGroupNeeded = false;
for (var iWrd = iStart; iWrd <= iEnd; iWrd++)
{
var sWord = elExamples[iWrd].Word;
var chCharThis = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (iWrd != iStart && chCharPrev != chCharThis)
{
if (bSubGroupNeeded)
{
AddSub(iStartGroup, iWrd - 1, chCharPrev);
bSubGroupNeeded = false;
}
iStartGroup = iWrd;
}
//TODO check out bSubGroupNeeded when there are multiple possible rules (not just lrBestRule)
if (elExamples[iWrd].Rule != lrBestRule)
{
bSubGroupNeeded = true;
}
chCharPrev = chCharThis;
}
if (bSubGroupNeeded && iStartGroup != iStart)
{
AddSub(iStartGroup, iEnd, chCharPrev);
}
}
private void AddSub(int iStart, int iEnd, char chChar)
{
var ltnSub = new LemmaTreeNode(lsett, elExamples, iStart, iEnd, this);
//TODO - maybe not really appropriate because statistics from multiple possible rules are lost
if (ltnSub.lrBestRule == lrBestRule && ltnSub.dictSubNodes == null)
return;
if (dictSubNodes == null)
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
dictSubNodes.Add(chChar, ltnSub);
}
#endregion
#region Essential Class Functions (running model = lemmatizing)
public bool ConditionSatisfied(string sWord)
{
//if (bWholeWord)
// return sWord == sCondition;
//else
// return sWord.EndsWith(sCondition);
var iDiff = sWord.Length - sCondition.Length;
if (iDiff < 0 || (bWholeWord && iDiff > 0))
return false;
var iWrdEnd = sCondition.Length - ltnParentNode.sCondition.Length - 1;
for (var iChar = 0; iChar < iWrdEnd; iChar++)
{
if (sCondition[iChar] != sWord[iChar + iDiff])
return false;
}
return true;
}
public string Lemmatize(string sWord)
{
if (sWord.Length >= iSimilarity && dictSubNodes != null)
{
char chChar = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (dictSubNodes.ContainsKey(chChar) && dictSubNodes[chChar].ConditionSatisfied(sWord))
return dictSubNodes[chChar].Lemmatize(sWord);
}
return lrBestRule.Lemmatize(sWord);
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
ToString(sb, 0);
return sb.ToString();
}
private void ToString(StringBuilder sb, int iLevel)
{
sb.Append(new string('\t', iLevel));
sb.AppendFormat("Suffix=\"{0}{1}\"; ", bWholeWord ? "^" : string.Empty, sCondition);
sb.AppendFormat("Rule=\"{0}\"; ", lrBestRule);
sb.AppendFormat("Weight=\"{0}\"; ", dWeight);
if (aBestRules != null && aBestRules.Length > 0)
sb.AppendFormat("Cover={0}; ", aBestRules[0].Weight);
sb.Append("Rules=");
if (aBestRules != null)
{
foreach (var rw in aBestRules)
sb.AppendFormat(" {0}", rw);
}
sb.Append("; ");
sb.AppendLine();
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
ltnChild.ToString(sb, iLevel + 1);
}
}
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(dictSubNodes != null);
if (dictSubNodes != null)
{
binWrt.Write(dictSubNodes.Count);
foreach (var kvp in dictSubNodes)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt);
}
}
binWrt.Write(iSimilarity);
binWrt.Write(sCondition);
binWrt.Write(bWholeWord);
binWrt.Write(lrBestRule.Signature);
binWrt.Write(aBestRules.Length);
for (var i = 0; i < aBestRules.Length; i++)
{
binWrt.Write(aBestRules[i].Rule.Signature);
binWrt.Write(aBestRules[i].Weight);
}
binWrt.Write(dWeight);
binWrt.Write(iStart);
binWrt.Write(iEnd);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
this.lsett = lsett;
if (binRead.ReadBoolean())
{
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
var iCount = binRead.ReadInt32();
for (var i = 0; i < iCount; i++)
{
var cKey = binRead.ReadChar();
var ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
{
dictSubNodes = null;
}
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt32();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBoolean();
lrBestRule = elExamples.Rules[binRead.ReadString()];
var iCountBest = binRead.ReadInt32();
aBestRules = new RuleWeighted[iCountBest];
for (var i = 0; i < iCountBest; i++)
{
aBestRules[i] =
new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
}
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt32();
iEnd = binRead.ReadInt32();
this.elExamples = elExamples;
}
public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
Deserialize(binRead, lsett, elExamples, ltnParentNode);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(dictSubNodes != null);
if (dictSubNodes != null) {
binWrt.WriteInt(dictSubNodes.Count);
foreach (KeyValuePair<char, LemmaTreeNode> kvp in dictSubNodes) {
binWrt.WriteChar(kvp.Key);
kvp.Value.Save(binWrt);
}
}
binWrt.WriteInt(iSimilarity);
binWrt.WriteString(sCondition);
binWrt.WriteBool(bWholeWord);
binWrt.WriteString(lrBestRule.Signature);
binWrt.WriteInt(aBestRules.Length);
for (int i = 0; i < aBestRules.Length; i++) {
binWrt.WriteString(aBestRules[i].Rule.Signature);
binWrt.WriteDouble(aBestRules[i].Weight);
}
binWrt.WriteDouble(dWeight);
binWrt.WriteInt(iStart);
binWrt.WriteInt(iEnd);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
this.lsett = lsett;
if (binRead.ReadBool()) {
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
int iCount = binRead.ReadInt();
for (int i = 0; i < iCount; i++) {
char cKey = binRead.ReadChar();
LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
dictSubNodes = null;
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBool();
lrBestRule = elExamples.Rules[binRead.ReadString()];
int iCountBest = binRead.ReadInt();
aBestRules = new RuleWeighted[iCountBest];
for (int i = 0; i < iCountBest; i++)
aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt();
iEnd = binRead.ReadInt();
this.elExamples = elExamples;
}
public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
Load(binRead, lsett, elExamples, ltnParentNode);
}
#endif
#endregion
#region Other (Temporary)
//TODO - this is a temporary function, remove it
public bool CheckConsistency()
{
var bReturn = true;
if (dictSubNodes != null)
foreach (var ltnChild in dictSubNodes.Values)
bReturn = bReturn &&
ltnChild.CheckConsistency() &&
ltnChild.sCondition.EndsWith(sCondition);
return bReturn;
}
#endregion
}
}
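Conceptually, LemmaTreeNode.Lemmatize above walks a suffix tree: each node has already matched iSimilarity trailing characters, the child to descend into is chosen by the character just in front of that suffix, and when no child matches, the node's best rule is applied. The following stripped-down illustration shows only that descent, using a hand-built toy tree with hypothetical rules rather than a trained model.

using System;
using System.Collections.Generic;

// Toy analogue of the descent in LemmaTreeNode.Lemmatize.
class MiniNode
{
    public int MatchedSuffixLen;                 // corresponds to iSimilarity
    public Dictionary<char, MiniNode> Children;  // corresponds to dictSubNodes
    public Func<string, string> BestRule;        // corresponds to lrBestRule.Lemmatize

    public string Lemmatize(string word)
    {
        if (Children != null && word.Length > MatchedSuffixLen)
        {
            // Pick the child by the character just in front of the matched suffix.
            char next = word[word.Length - 1 - MatchedSuffixLen];
            if (Children.TryGetValue(next, out var child))
                return child.Lemmatize(word);
        }
        return BestRule(word);
    }
}

class TreeDescentDemo
{
    static void Main()
    {
        // Hand-built two-level tree: root applies the identity rule;
        // the child under 'g' strips a three-character suffix.
        var root = new MiniNode
        {
            MatchedSuffixLen = 0,
            BestRule = w => w,
            Children = new Dictionary<char, MiniNode>
            {
                ['g'] = new MiniNode
                {
                    MatchedSuffixLen = 3,
                    BestRule = w => w.Substring(0, w.Length - 3)
                }
            }
        };
        Console.WriteLine(root.Lemmatize("walking")); // descends to the 'g' child
        Console.WriteLine(root.Lemmatize("cat"));     // stays at the root
    }
}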

@@ -0,0 +1,465 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Runtime.Serialization;
using System.IO.Compression;
using SevenZip;
namespace LemmaSharp
{
[Serializable]
public class Lemmatizer : ITrainableLemmatizer
#if LATINO
, Latino.ISerializable
#endif
{
#region Private Variables
protected LemmatizerSettings lsett;
protected ExampleList elExamples;
protected LemmaTreeNode ltnRootNode;
protected LemmaTreeNode ltnRootNodeFront;
#endregion
#region Constructor(s)
public Lemmatizer() :
this(new LemmatizerSettings())
{ }
public Lemmatizer(LemmatizerSettings lsett)
{
this.lsett = lsett;
this.elExamples = new ExampleList(lsett);
this.ltnRootNode = null;
this.ltnRootNodeFront = null;
}
public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Private Properties
private LemmaTreeNode ltrRootNodeSafe
{
get
{
if (ltnRootNode == null)
BuildModel();
return ltnRootNode;
}
}
private LemmaTreeNode ltrRootNodeFrontSafe
{
get
{
if (ltnRootNodeFront == null && lsett.bBuildFrontLemmatizer)
BuildModel();
return ltnRootNodeFront;
}
}
#endregion
#region Public Properties
public LemmatizerSettings Settings
{
get
{
return lsett.CloneDeep();
}
}
public ExampleList Examples
{
get
{
return elExamples;
}
}
public RuleList Rules
{
get
{
return elExamples.Rules;
}
}
public LemmaTreeNode RootNode
{
get
{
return ltrRootNodeSafe;
}
}
public LemmaTreeNode RootNodeFront
{
get
{
return ltrRootNodeFrontSafe;
}
}
public ILemmatizerModel Model
{
get
{
return ltrRootNodeSafe;
}
}
#endregion
#region Essential Class Functions (adding examples to repository)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
this.elExamples.AddMultextFile(srIn, sFormat);
ltnRootNode = null;
}
public void AddExample(string sWord, string sLemma)
{
AddExample(sWord, sLemma, 1, null);
}
public void AddExample(string sWord, string sLemma, double dWeight)
{
AddExample(sWord, sLemma, dWeight, null);
}
public void AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
elExamples.AddExample(sWord, sLemma, dWeight, sMsd);
ltnRootNode = null;
}
public void DropExamples()
{
elExamples.DropExamples();
}
public void FinalizeAdditions()
{
elExamples.FinalizeAdditions();
}
#endregion
#region Essential Class Functions (building model & lemmatizing)
public void BuildModel()
{
if (ltnRootNode != null)
return;
if (!lsett.bBuildFrontLemmatizer)
{
//TODO remove: elExamples.FinalizeAdditions();
elExamples.FinalizeAdditions();
ltnRootNode = new LemmaTreeNode(lsett, elExamples);
}
else
{
ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
}
}
public string Lemmatize(string sWord)
{
if (!lsett.bBuildFrontLemmatizer)
{
return ltrRootNodeSafe.Lemmatize(sWord);
}
var sWordFront = LemmaExample.StringReverse(sWord);
var sLemmaFront = ltrRootNodeFrontSafe.Lemmatize(sWordFront);
var sWordRear = LemmaExample.StringReverse(sLemmaFront);
return ltrRootNodeSafe.Lemmatize(sWordRear);
}
#endregion
#region Serialization Functions (ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("elExamples", elExamples);
}
public Lemmatizer(SerializationInfo info, StreamingContext context) : this()
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
elExamples = (ExampleList)info.GetValue("elExamples", typeof(ExampleList));
this.BuildModel();
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples)
{
lsett.Serialize(binWrt);
binWrt.Write(bSerializeExamples);
elExamples.Serialize(binWrt, bSerializeExamples, false);
if (!bSerializeExamples)
{
elExamples.GetFrontRearExampleList(false).Serialize(binWrt, bSerializeExamples, false);
elExamples.GetFrontRearExampleList(true).Serialize(binWrt, bSerializeExamples, false);
}
ltnRootNode.Serialize(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead)
{
lsett = new LemmatizerSettings(binRead);
var bSerializeExamples = binRead.ReadBoolean();
elExamples = new ExampleList(binRead, lsett);
ExampleList elExamplesRear;
ExampleList elExamplesFront;
if (bSerializeExamples)
{
elExamplesRear = elExamples.GetFrontRearExampleList(false);
elExamplesFront = elExamples.GetFrontRearExampleList(true);
}
else
{
elExamplesRear = new ExampleList(binRead, lsett);
elExamplesFront = new ExampleList(binRead, lsett);
}
if (!lsett.bBuildFrontLemmatizer)
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
}
}
//Do not change the order! If new compression algorithms are added, append them at the end; otherwise old files will not load.
public enum Compression
{
None,
Deflate,
LZMA
}
public Lemmatizer(BinaryReader binRead)
{
var compr = (Compression)binRead.ReadByte();
if (compr == Compression.None)
Deserialize(binRead);
else
throw new Exception("Loading the lemmatizer with a binary reader is supported only for uncompressed streams.");
}
public Lemmatizer(Stream streamIn)
{
Deserialize(streamIn);
}
public void Serialize(Stream streamOut)
{
Serialize(streamOut, true, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples)
{
Serialize(streamOut, bSerializeExamples, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples, Compression compress)
{
streamOut.WriteByte((byte)compress);
switch (compress)
{
case Compression.None:
SerializeNone(streamOut, bSerializeExamples);
break;
case Compression.Deflate:
SerializeDeflate(streamOut, bSerializeExamples);
break;
case Compression.LZMA:
SerializeLZMA(streamOut, bSerializeExamples);
break;
default:
break;
}
}
private void SerializeNone(Stream streamOut, bool bSerializeExamples)
{
using (var binWrt = new BinaryWriter(streamOut))
{
this.Serialize(binWrt, bSerializeExamples);
}
}
private void SerializeDeflate(Stream streamOut, bool bSerializeExamples)
{
using (var streamOutNew = new DeflateStream(streamOut, CompressionMode.Compress, true))
{
using (var binWrt = new BinaryWriter(streamOutNew))
{
this.Serialize(binWrt, bSerializeExamples);
binWrt.Flush();
binWrt.Close();
}
}
}
private void SerializeLZMA(Stream streamOut, bool bSerializeExamples)
{
CoderPropID[] propIDs =
{
CoderPropID.DictionarySize,
CoderPropID.PosStateBits,
CoderPropID.LitContextBits,
CoderPropID.LitPosBits,
CoderPropID.Algorithm,
CoderPropID.NumFastBytes,
CoderPropID.MatchFinder,
CoderPropID.EndMarker
};
Int32 dictionary = 1 << 23;
Int32 posStateBits = 2;
Int32 litContextBits = 3; // for normal files
Int32 litPosBits = 0;
Int32 algorithm = 2;
Int32 numFastBytes = 128;
var mf = "bt4";
var eos = false;
object[] properties =
{
(Int32)(dictionary),
(Int32)(posStateBits),
(Int32)(litContextBits),
(Int32)(litPosBits),
(Int32)(algorithm),
(Int32)(numFastBytes),
mf,
eos
};
using (var msTemp = new MemoryStream())
{
using (var binWrtTemp = new BinaryWriter(msTemp))
{
this.Serialize(binWrtTemp, bSerializeExamples);
msTemp.Position = 0;
var encoder = new SevenZip.Compression.LZMA.Encoder();
encoder.SetCoderProperties(propIDs, properties);
encoder.WriteCoderProperties(streamOut);
var fileSize = msTemp.Length;
for (int i = 0; i < 8; i++)
{
streamOut.WriteByte((Byte)(fileSize >> (8 * i)));
}
encoder.Code(msTemp, streamOut, -1, -1, null);
binWrtTemp.Close();
encoder = null;
}
msTemp.Close();
}
}
public void Deserialize(Stream streamIn)
{
var compr = (Compression)streamIn.ReadByte();
using (var streamInNew = Decompress(streamIn, compr))
{
using (var br = new BinaryReader(streamInNew))
{
Deserialize(br);
}
}
}
private Stream Decompress(Stream streamIn, Compression compress)
{
Stream streamInNew;
switch (compress)
{
case Compression.None:
default:
streamInNew = streamIn;
break;
case Compression.Deflate:
streamInNew = new DeflateStream(streamIn, CompressionMode.Decompress);
break;
case Compression.LZMA:
streamInNew = DecompressLZMA(streamIn);
break;
}
return streamInNew;
}
private Stream DecompressLZMA(Stream streamIn)
{
var properties = new byte[5];
if (streamIn.Read(properties, 0, 5) != 5)
throw new Exception("input .lzma is too short");
var decoder = new SevenZip.Compression.LZMA.Decoder();
decoder.SetDecoderProperties(properties);
long outSize = 0;
for (var i = 0; i < 8; i++)
{
var v = streamIn.ReadByte();
if (v < 0)
throw (new Exception("Can't Read 1"));
outSize |= ((long)(byte)v) << (8 * i);
}
var compressedSize = streamIn.Length - streamIn.Position;
var outStream = new MemoryStream();
decoder.Code(streamIn, outStream, compressedSize, outSize, null);
outStream.Seek(0, 0);
decoder = null;
return outStream;
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
lsett.Save(binWrt);
elExamples.Save(binWrt, true, false);
ltnRootNode.Save(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead) {
lsett = new LemmatizerSettings(binRead);
elExamples = new ExampleList(binRead, lsett);
if (!lsett.bBuildFrontLemmatizer) {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false) , null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
}
}
public Lemmatizer(Latino.BinarySerializer binRead) {
Load(binRead);
}
public void Save(Stream streamOut) {
Latino.BinarySerializer binWrt = new Latino.BinarySerializer(streamOut);
this.Save(binWrt);
binWrt.Close();
}
public void Load(Stream streamIn) {
Latino.BinarySerializer binRead = new Latino.BinarySerializer(streamIn);
Load(binRead);
binRead.Close();
}
public Lemmatizer(Stream streamIn, string sDummy) {
Load(streamIn);
}
#endif
#endregion
}
}
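Putting the pieces together, a typical round trip with the Lemmatizer class added above might look like the following sketch; the training pairs, the expected behaviour and the file name are illustrative only.

using System.IO;
using LemmaSharp;

class LemmatizerUsageDemo
{
    static void Main()
    {
        // Train on a few hypothetical word/lemma pairs.
        var lemmatizer = new Lemmatizer();
        lemmatizer.AddExample("walking", "walk");
        lemmatizer.AddExample("talked", "talk");
        lemmatizer.AddExample("cats", "cat", 2.0);
        lemmatizer.BuildModel();

        System.Console.WriteLine(lemmatizer.Lemmatize("jumping"));

        // Persist the trained model; the compression byte written first lets
        // the stream constructor pick the right decoder when loading.
        using (var fsOut = File.Create("model.bin"))
            lemmatizer.Serialize(fsOut, true, Lemmatizer.Compression.Deflate);

        using (var fsIn = File.OpenRead("model.bin"))
        {
            var loaded = new Lemmatizer(fsIn);
            System.Console.WriteLine(loaded.Lemmatize("walking"));
        }
    }
}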

@@ -0,0 +1,143 @@
using System;
using System.IO;
using System.Runtime.Serialization;
namespace LemmaSharp
{
/// <summary>
/// These are the LemmaGen algorithm settings that affect the speed/power of the learning and lemmatizing algorithm.
/// TODO this class will probably be removed in the future.
/// </summary>
[Serializable]
public class LemmatizerSettings : ISerializable
{
#region Constructor(s)
public LemmatizerSettings()
{
}
#endregion
#region Sub-Structures
/// <summary>
/// How the algorithm considers MSD tags.
/// </summary>
public enum MsdConsideration
{
/// <summary>
/// Completely ignores MSD tags (joins examples with different tags and sums their weights).
/// </summary>
Ignore,
/// <summary>
/// Examples that differ only in their MSD tags are kept distinct and are not joined.
/// </summary>
Distinct,
/// <summary>
/// Joins examples with different tags (concatenates all MSD tags).
/// </summary>
JoinAll,
/// <summary>
/// Joins examples with different tags (concatenates only distinct MSD tags; somewhat slower).
/// </summary>
JoinDistinct,
/// <summary>
/// Joins examples with different tags (the new tag is the common left-to-right substring that all joined examples share).
/// </summary>
JoinSameSubstring
}
#endregion
#region Public Variables
/// <summary>
/// True if the 'from' string should be included in the rule identifier ([from]->[to]); false if only the length of the 'from' string is used ([#len]->[to]).
/// </summary>
public bool bUseFromInRules = true;
/// <summary>
/// Specifies how the algorithm considers MSD tags.
/// </summary>
public MsdConsideration eMsdConsider = MsdConsideration.Distinct;
/// <summary>
/// How many of the best rules are kept in memory for each node. Zero means unlimited.
/// </summary>
public int iMaxRulesPerNode = 0;
/// <summary>
/// If true, the build process uses a few more heuristics to first build a left-to-right lemmatizer (which lemmatizes the front of the word).
/// </summary>
public bool bBuildFrontLemmatizer = false;
#endregion
#region Cloneable functions
public LemmatizerSettings CloneDeep()
{
return new LemmatizerSettings()
{
bUseFromInRules = this.bUseFromInRules,
eMsdConsider = this.eMsdConsider,
iMaxRulesPerNode = this.iMaxRulesPerNode,
bBuildFrontLemmatizer = this.bBuildFrontLemmatizer
};
}
#endregion
#region Serialization Functions (ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("bUseFromInRules", bUseFromInRules);
info.AddValue("eMsdConsider", eMsdConsider);
info.AddValue("iMaxRulesPerNode", iMaxRulesPerNode);
info.AddValue("bBuildFrontLemmatizer", bBuildFrontLemmatizer);
}
public LemmatizerSettings(SerializationInfo info, StreamingContext context)
{
bUseFromInRules = info.GetBoolean("bUseFromInRules");
eMsdConsider = (MsdConsideration)info.GetValue("eMsdConsider", typeof(MsdConsideration));
iMaxRulesPerNode = info.GetInt32("iMaxRulesPerNode");
bBuildFrontLemmatizer = info.GetBoolean("bBuildFrontLemmatizer");
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(bUseFromInRules);
binWrt.Write((int)eMsdConsider);
binWrt.Write(iMaxRulesPerNode);
binWrt.Write(bBuildFrontLemmatizer);
}
public void Deserialize(BinaryReader binRead)
{
bUseFromInRules = binRead.ReadBoolean();
eMsdConsider = (MsdConsideration)binRead.ReadInt32();
iMaxRulesPerNode = binRead.ReadInt32();
bBuildFrontLemmatizer = binRead.ReadBoolean();
}
public LemmatizerSettings(System.IO.BinaryReader binRead)
{
this.Deserialize(binRead);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(bUseFromInRules);
binWrt.WriteInt((int)eMsdConsider);
binWrt.WriteInt(iMaxRulesPerNode);
binWrt.WriteBool(bBuildFrontLemmatizer);
}
public void Load(Latino.BinarySerializer binRead) {
bUseFromInRules = binRead.ReadBool();
eMsdConsider = (MsdConsideration)binRead.ReadInt();
iMaxRulesPerNode = binRead.ReadInt();
bBuildFrontLemmatizer = binRead.ReadBool();
}
public LemmatizerSettings(Latino.BinarySerializer reader) {
Load(reader);
}
#endif
#endregion
}
}
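Since the settings fields above are public, a lemmatizer can be configured with an object initializer before training. A hypothetical configuration (values chosen purely for illustration) might look like this:

using LemmaSharp;

class SettingsDemo
{
    static void Main()
    {
        // Ignore MSD tags, cap the rules kept per node, and also train a front (prefix) lemmatizer.
        var settings = new LemmatizerSettings
        {
            eMsdConsider = LemmatizerSettings.MsdConsideration.Ignore,
            iMaxRulesPerNode = 30,
            bBuildFrontLemmatizer = true,
            bUseFromInRules = true
        };

        var lemmatizer = new Lemmatizer(settings);
        lemmatizer.AddExample("running", "run");
        lemmatizer.BuildModel();
        System.Console.WriteLine(lemmatizer.Lemmatize("sitting"));
    }
}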

@@ -0,0 +1,161 @@
using System.Collections.Generic;
using System.IO;
namespace LemmaSharp
{
public class RuleList : Dictionary<string, LemmaRule>
{
#region Private Variables
private LemmatizerSettings lsett;
private LemmaRule lrDefaultRule;
#endregion
#region Constructor(s)
public RuleList(LemmatizerSettings lsett)
{
this.lsett = lsett;
lrDefaultRule = AddRule(new LemmaRule("", "", 0, lsett));
}
#endregion
#region Public Properties
public LemmaRule DefaultRule
{
get
{
return lrDefaultRule;
}
}
#endregion
#region Essential Class Functions
public LemmaRule AddRule(LemmaExample le)
{
return AddRule(new LemmaRule(le.Word, le.Lemma, this.Count, lsett));
}
private LemmaRule AddRule(LemmaRule lrRuleNew)
{
LemmaRule lrRuleReturn = null;
if (!this.TryGetValue(lrRuleNew.Signature, out lrRuleReturn))
{
lrRuleReturn = lrRuleNew;
this.Add(lrRuleReturn.Signature, lrRuleReturn);
}
return lrRuleReturn;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
//save list items ---------------------------------------
var iCount = this.Count;
binWrt.Write(iCount);
foreach (var kvp in this)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt, false);
}
//the default rule is already saved in the list; here just save its signature.
binWrt.Write(lrDefaultRule.Signature);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var sKey = binRead.ReadString();
var lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link up the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
//save list items ---------------------------------------
int iCount = this.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this) {
binWrt.WriteString(kvp.Key);
kvp.Value.Save(binWrt, false);
}
//the default rule is already saved in the list; here just save its signature.
binWrt.WriteString(lrDefaultRule.Signature);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
string sKey = binRead.ReadString();
LemmaRule lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link up the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@@ -0,0 +1,50 @@
using System;
namespace LemmaSharp
{
[Serializable]
class RuleWeighted : IComparable<RuleWeighted>
{
#region Private Variables
private LemmaRule lrRule;
private double dWeight;
#endregion
#region Constructor(s)
public RuleWeighted(LemmaRule lrRule, double dWeight)
{
this.lrRule = lrRule;
this.dWeight = dWeight;
}
#endregion
#region Public Properties
public LemmaRule Rule
{
get { return lrRule; }
}
public double Weight
{
get { return dWeight; }
}
#endregion
#region Essential Class Functions (comparing objects, e.g. for sorting)
public int CompareTo(RuleWeighted rl)
{
if (this.dWeight < rl.dWeight) return 1;
if (this.dWeight > rl.dWeight) return -1;
if (this.lrRule.Id < rl.lrRule.Id) return 1;
if (this.lrRule.Id > rl.lrRule.Id) return -1;
return 0;
}
#endregion
#region Output & Serialization Functions
public override string ToString()
{
return string.Format("{0}{1:(0.00%)}", lrRule, dWeight);
}
#endregion
}
}

@@ -0,0 +1,9 @@
using System.Runtime.Serialization;
namespace LemmaSharp
{
public interface ILemmatizer : ISerializable
{
string Lemmatize(string sWord);
}
}

@@ -0,0 +1,8 @@
namespace LemmaSharp
{
public interface ILemmatizerModel
{
string Lemmatize(string sWord);
string ToString();
}
}

@@ -0,0 +1,12 @@
namespace LemmaSharp
{
public interface ITrainableLemmatizer : ILemmatizer
{
ExampleList Examples { get; }
ILemmatizerModel Model { get; }
void AddExample(string sWord, string sLemma);
void AddExample(string sWord, string sLemma, double dWeight);
void AddExample(string sWord, string sLemma, double dWeight, string sMsd);
void BuildModel();
}
}

@@ -0,0 +1,539 @@
/*==========================================================================;
*
* (c) 2004-08 JSI. All rights reserved.
*
* File: BinarySerializer.cs
* Version: 1.0
* Desc: Binary serializer
* Author: Miha Grcar
* Created on: Oct-2004
* Last modified: May-2008
* Revision: May-2008
*
***************************************************************************/
//Remark: Use this file as a Latino compatibility checker. When it is included in
// the project, it defines the symbol LATINO, which enables all Latino-specific
// serialization functions. When it is excluded, this code is not compiled and the
// following Latino namespace is not added to the project.
using System;
using System.Runtime.InteropServices;
using System.Collections.Generic;
using System.Reflection;
using System.Text;
using System.IO;
#if LATINO
namespace Latino
{
/* .-----------------------------------------------------------------------
|
| Class BinarySerializer
|
'-----------------------------------------------------------------------
*/
public interface ISerializable {
// *** note that you need to implement a constructor that loads the instance if the class implements Latino.ISerializable
void Save(Latino.BinarySerializer writer);
}
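// A minimal sketch of the expected pattern (illustrative; the type and field names are hypothetical):
// a Latino-serializable class pairs Save() with a constructor taking a BinarySerializer, because
// ReadObject/ReadValue below locate exactly that constructor via reflection.
//
//   public class SampleItem : Latino.ISerializable {
//       private string m_data = "";
//       public SampleItem(Latino.BinarySerializer reader) { m_data = reader.ReadString(); }
//       public void Save(Latino.BinarySerializer writer) { writer.WriteString(m_data); }
//   }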
public class BinarySerializer
{
private static Dictionary<string, string> m_full_to_short_type_name
= new Dictionary<string, string>();
private static Dictionary<string, string> m_short_to_full_type_name
= new Dictionary<string, string>();
private Stream m_stream;
private string m_data_dir
= ".";
private static void RegisterTypeName(string full_type_name, string short_type_name)
{
m_full_to_short_type_name.Add(full_type_name, short_type_name);
m_short_to_full_type_name.Add(short_type_name, full_type_name);
}
private static string GetFullTypeName(string short_type_name)
{
return m_short_to_full_type_name.ContainsKey(short_type_name) ? m_short_to_full_type_name[short_type_name] : short_type_name;
}
private static string GetShortTypeName(string full_type_name)
{
return m_full_to_short_type_name.ContainsKey(full_type_name) ? m_full_to_short_type_name[full_type_name] : full_type_name;
}
static BinarySerializer()
{
RegisterTypeName(typeof(bool).AssemblyQualifiedName, "b");
RegisterTypeName(typeof(byte).AssemblyQualifiedName, "ui1");
RegisterTypeName(typeof(sbyte).AssemblyQualifiedName, "i1");
RegisterTypeName(typeof(char).AssemblyQualifiedName, "c");
RegisterTypeName(typeof(double).AssemblyQualifiedName, "f8");
RegisterTypeName(typeof(float).AssemblyQualifiedName, "f4");
RegisterTypeName(typeof(int).AssemblyQualifiedName, "i4");
RegisterTypeName(typeof(uint).AssemblyQualifiedName, "ui4");
RegisterTypeName(typeof(long).AssemblyQualifiedName, "i8");
RegisterTypeName(typeof(ulong).AssemblyQualifiedName, "ui8");
RegisterTypeName(typeof(short).AssemblyQualifiedName, "i2");
RegisterTypeName(typeof(ushort).AssemblyQualifiedName, "ui2");
RegisterTypeName(typeof(string).AssemblyQualifiedName, "s");
}
public BinarySerializer(Stream stream)
{
//Utils.ThrowException(stream == null ? new ArgumentNullException("stream") : null);
m_stream = stream;
}
public BinarySerializer()
{
m_stream = new MemoryStream();
}
public BinarySerializer(string file_name, FileMode file_mode)
{
m_stream = new FileStream(file_name, file_mode); // throws ArgumentException, NotSupportedException, ArgumentNullException, SecurityException, FileNotFoundException, IOException, DirectoryNotFoundException, PathTooLongException, ArgumentOutOfRangeException
}
// *** Reading ***
private byte[] Read<T>() // Read<T>() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int sz = Marshal.SizeOf(typeof(T));
byte[] buffer = new byte[sz];
int num_bytes = m_stream.Read(buffer, 0, sz); // throws IOException, NotSupportedException, ObjectDisposedException
//Utils.ThrowException(num_bytes < sz ? new EndOfStreamException() : null);
return buffer;
}
public bool ReadBool()
{
return ReadByte() != 0;
}
public byte ReadByte() // ReadByte() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int val = m_stream.ReadByte(); // throws NotSupportedException, ObjectDisposedException
//Utils.ThrowException(val < 0 ? new EndOfStreamException() : null);
return (byte)val;
}
public sbyte ReadSByte()
{
return (sbyte)ReadByte();
}
private char ReadChar8()
{
return (char)ReadByte();
}
private char ReadChar16()
{
return BitConverter.ToChar(Read<ushort>(), 0);
}
public char ReadChar()
{
return ReadChar16();
}
public double ReadDouble()
{
return BitConverter.ToDouble(Read<double>(), 0);
}
public float ReadFloat()
{
return BitConverter.ToSingle(Read<float>(), 0);
}
public int ReadInt()
{
return BitConverter.ToInt32(Read<int>(), 0);
}
public uint ReadUInt()
{
return BitConverter.ToUInt32(Read<uint>(), 0);
}
public long ReadLong()
{
return BitConverter.ToInt64(Read<long>(), 0);
}
public ulong ReadULong()
{
return BitConverter.ToUInt64(Read<ulong>(), 0);
}
public short ReadShort()
{
return BitConverter.ToInt16(Read<short>(), 0);
}
public ushort ReadUShort()
{
return BitConverter.ToUInt16(Read<ushort>(), 0);
}
private string ReadString8()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len];
m_stream.Read(buffer, 0, len); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.ASCII.GetString(buffer);
}
private string ReadString16()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len * 2];
m_stream.Read(buffer, 0, len * 2); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.Unicode.GetString(buffer);
}
public string ReadString()
{
return ReadString16(); // throws exceptions (see ReadString16())
}
public Type ReadType()
{
string type_name = ReadString8(); // throws exceptions (see ReadString8())
//Utils.ThrowException(type_name == null ? new InvalidDataException() : null);
return Type.GetType(GetFullTypeName(type_name)); // throws TargetInvocationException, ArgumentException, TypeLoadException, FileNotFoundException, FileLoadException, BadImageFormatException
}
public ValueType ReadValue(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsValueType ? new InvalidArgumentValueException("type") : null);
if (type == typeof(bool))
{
return ReadBool();
}
else if (type == typeof(byte))
{
return ReadByte();
}
else if (type == typeof(sbyte))
{
return ReadSByte();
}
else if (type == typeof(char))
{
return ReadChar();
}
else if (type == typeof(double))
{
return ReadDouble();
}
else if (type == typeof(float))
{
return ReadFloat();
}
else if (type == typeof(int))
{
return ReadInt();
}
else if (type == typeof(uint))
{
return ReadUInt();
}
else if (type == typeof(long))
{
return ReadLong();
}
else if (type == typeof(ulong))
{
return ReadULong();
}
else if (type == typeof(short))
{
return ReadShort();
}
else if (type == typeof(ushort))
{
return ReadUShort();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return (ValueType)cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else
{
//throw new ArgumentNotSupportedException("type");
throw new Exception("type");
}
}
public T ReadValue<T>()
{
return (T)(object)ReadValue(typeof(T)); // throws exceptions (see ReadValue(Type type))
}
public object ReadObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
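// Object header byte (as written by WriteObject below): 0 = null reference, 1 = the object is
// exactly of the declared type, 2 = a (short) type name follows and the actual type is read first.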
switch (ReadByte())
{
case 0:
return null;
case 1:
break;
case 2:
Type type_0 = ReadType(); // throws exceptions (see ReadType())
//Utils.ThrowException(type_0 == null ? new TypeLoadException() : null);
//Utils.ThrowException(!type.IsAssignableFrom(type_0) ? new InvalidArgumentValueException("type") : null);
type = type_0;
break;
default:
throw new InvalidDataException();
}
if (type == typeof(string))
{
return ReadString();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
//throw new InvalidArgumentValueException("type");
throw new Exception("type");
}
}
public T ReadObject<T>()
{
return (T)ReadObject(typeof(T)); // throws exceptions (see ReadObject(Type type))
}
public object ReadValueOrObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
return ReadObject(type); // throws exceptions (see ReadObject(Type type))
}
}
public T ReadValueOrObject<T>()
{
return (T)ReadValueOrObject(typeof(T)); // throws exceptions (see ReadValueOrObject(Type type))
}
// *** Writing ***
private void Write(byte[] data) // Write(byte[] data) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.Write(data, 0, data.Length); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteBool(bool val)
{
WriteByte(val ? (byte)1 : (byte)0);
}
public void WriteByte(byte val) // WriteByte(byte val) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.WriteByte(val); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteSByte(sbyte val)
{
WriteByte((byte)val);
}
private void WriteChar8(char val)
{
WriteByte(Encoding.ASCII.GetBytes(new char[] { val })[0]);
}
private void WriteChar16(char val)
{
Write(BitConverter.GetBytes((ushort)val));
}
public void WriteChar(char val)
{
WriteChar16(val);
}
public void WriteDouble(double val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteFloat(float val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteInt(int val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUInt(uint val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteLong(long val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteULong(ulong val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteShort(short val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUShort(ushort val)
{
Write(BitConverter.GetBytes(val));
}
private void WriteString8(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.ASCII.GetBytes(val));
}
private void WriteString16(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.Unicode.GetBytes(val));
}
public void WriteString(string val)
{
WriteString16(val);
}
public void WriteValue(ValueType val)
{
if (val is bool)
{
WriteBool((bool)val);
}
else if (val is byte)
{
WriteByte((byte)val);
}
else if (val is sbyte)
{
WriteSByte((sbyte)val);
}
else if (val is char)
{
WriteChar((char)val);
}
else if (val is double)
{
WriteDouble((double)val);
}
else if (val is float)
{
WriteFloat((float)val);
}
else if (val is int)
{
WriteInt((int)val);
}
else if (val is uint)
{
WriteUInt((uint)val);
}
else if (val is long)
{
WriteLong((long)val);
}
else if (val is ulong)
{
WriteULong((ulong)val);
}
else if (val is short)
{
WriteShort((short)val);
}
else if (val is ushort)
{
WriteUShort((ushort)val);
}
else if (val is Latino.ISerializable)
{
((Latino.ISerializable)val).Save(this); // throws serialization-related exceptions
}
else
{
//throw new ArgumentTypeException("val");
}
}
public void WriteObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException((obj != null && !type.IsAssignableFrom(obj.GetType())) ? new ArgumentTypeException("obj") : null);
if (obj == null)
{
WriteByte(0);
}
else
{
Type obj_type = obj.GetType();
if (obj_type == type)
{
WriteByte(1);
}
else
{
WriteByte(2);
WriteType(obj_type);
}
if (obj is string)
{
WriteString((string)obj);
}
else if (obj is Latino.ISerializable)
{
((Latino.ISerializable)obj).Save(this); // throws serialization-related exceptions
}
else if (obj is ValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
//throw new ArgumentTypeException("obj");
}
}
}
public void WriteObject<T>(T obj)
{
WriteObject(typeof(T), obj); // throws exceptions (see WriteObject(Type type, object obj))
}
public void WriteValueOrObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsAssignableFrom(obj.GetType()) ? new ArgumentTypeException("obj") : null);
if (type.IsValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
WriteObject(type, obj); // throws exceptions (see WriteObject(Type type, object obj))
}
}
public void WriteValueOrObject<T>(T obj)
{
WriteValueOrObject(typeof(T), obj); // throws exceptions (see WriteValueOrObject(Type type, object obj))
}
public void WriteType(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
WriteString8(GetShortTypeName(type.AssemblyQualifiedName));
}
// *** Data directory ***
public string DataDir
{
get { return m_data_dir; }
set
{
//Utils.ThrowException(!Utils.VerifyPathName(value, /*must_exist=*/true) ? new InvalidArgumentValueException("DataDir") : null);
m_data_dir = value;
}
}
// *** Access to the associated stream ***
public void Close()
{
m_stream.Close();
}
public void Flush()
{
m_stream.Flush(); // throws IOException
}
public Stream Stream
{
get { return m_stream; }
}
}
}
#endif

@ -0,0 +1,165 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{A39293C1-92D8-47B9-93A4-41F443B4F9E4}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharp</RootNamespace>
<AssemblyName>LemmaSharp</AssemblyName>
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<IsWebBootstrapper>true</IsWebBootstrapper>
<StartupObject>
</StartupObject>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<TargetFrameworkProfile />
<PublishUrl>http://localhost/LemmaSharp/</PublishUrl>
<Install>true</Install>
<InstallFrom>Web</InstallFrom>
<UpdateEnabled>true</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>TRACE;DEBUG;NOLATINO</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE;NOLATINO</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x86\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<OutputPath>bin\x86\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="Lzma#, Version=4.12.3884.11200, Culture=neutral, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>ExternalLibs\Lzma#.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="LatinoCompatibility\BinarySerializer.cs">
<SubType>Code</SubType>
</Compile>
<Compile Include="Interfaces\ILemmatizer.cs" />
<Compile Include="Interfaces\ILemmatizerModel.cs" />
<Compile Include="Interfaces\ILemmatizerTrainable.cs" />
<Compile Include="Classes\LemmatizerSettings.cs" />
<Compile Include="Classes\LemmaRule.cs" />
<Compile Include="Classes\Lemmatizer.cs" />
<Compile Include="Classes\LemmaTreeNode.cs" />
<Compile Include="Classes\LemmaExample.cs" />
<Compile Include="Classes\ExampleList.cs" />
<Compile Include="Classes\RuleList.cs" />
<Compile Include="Classes\RuleWeighted.cs" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Client.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework Client Profile</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.2.0">
<Visible>False</Visible>
<ProductName>.NET Framework 2.0 %28x86%29</ProductName>
<Install>true</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.0">
<Visible>False</Visible>
<ProductName>.NET Framework 3.0 %28x86%29</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>false</Install>
</BootstrapperPackage>
</ItemGroup>
<ItemGroup>
<Folder Include="Properties\" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

@ -0,0 +1,34 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaSharp", "LemmaSharp\LemmaSharp.csproj", "{A39293C1-92D8-47B9-93A4-41F443B4F9E4}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.ActiveCfg = Debug|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.Build.0 = Debug|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.ActiveCfg = Debug|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.Build.0 = Debug|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.Build.0 = Release|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.ActiveCfg = Release|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.Build.0 = Release|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.ActiveCfg = Release|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.Build.0 = Release|x86
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@ -0,0 +1,381 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.Serialization;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class ExampleList : ISerializable
{
#region Private Variables
private LemmatizerSettings lsett;
private RuleList rlRules;
private Dictionary<string, LemmaExample> dictExamples;
private List<LemmaExample> lstExamples;
#endregion
#region Constructor(s)
public ExampleList(LemmatizerSettings lsett) : base()
{
this.lsett = lsett;
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
}
public ExampleList(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Public Properties & Indexers
public LemmaExample this[int i]
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples[i];
}
}
public int Count
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples.Count;
}
}
public double WeightSum
{
get
{
if (lstExamples == null)
FinalizeAdditions();
double dWeight = 0;
foreach (LemmaExample exm in lstExamples)
dWeight += exm.Weight;
return dWeight;
}
}
public RuleList Rules
{
get
{
return rlRules;
}
}
public List<LemmaExample> ListExamples
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples;
}
}
#endregion
#region Essential Class Functions (adding/removing examples)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
//read from file
string sLine = null;
int iError = 0;
int iLine = 0;
var iW = sFormat.IndexOf('W');
var iL = sFormat.IndexOf('L');
var iM = sFormat.IndexOf('M');
var iF = sFormat.IndexOf('F');
var iLen = Math.Max(Math.Max(iW, iL), Math.Max(iM, iF)) + 1;
if (iW < 0 || iL < 0)
{
throw new Exception("Can not find word and lemma location in the format specification");
}
while ((sLine = srIn.ReadLine()) != null && iError < 50)
{
iLine++;
string[] asWords = sLine.Split(new char[] { '\t' });
if (asWords.Length < iLen)
{
//Console.WriteLine("ERROR: Line doesn't confirm to the given format \"" + sFormat + "\"! Line " + iLine.ToString() + ".");
iError++;
continue;
}
var sWord = asWords[iW];
var sLemma = asWords[iL];
if (sLemma.Equals("=", StringComparison.Ordinal))
sLemma = sWord;
string sMsd = null;
if (iM > -1)
sMsd = asWords[iM];
double dWeight = 1;
if (iF > -1 && !Double.TryParse(asWords[iF], out dWeight))
dWeight = 1; //weight is read from the frequency column; fall back to 1 if it cannot be parsed
AddExample(sWord, sLemma, dWeight, sMsd);
}
if (iError == 50)
throw new Exception("Parsing stopped because of too many (50) errors. Check format specification");
}
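// Format example (illustrative): with sFormat = "WLM" every input line is expected to be
// tab-separated as <word>\t<lemma>\t<msd>; a lemma of "=" means the lemma equals the word, and an
// optional 'F' column (e.g. sFormat = "WLMF") supplies a numeric frequency used as the example weight.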
public LemmaExample AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
string sNewMsd = lsett.eMsdConsider != LemmatizerSettings.MsdConsideration.Ignore
? sMsd
: null;
var leNew = new LemmaExample(sWord, sLemma, dWeight, sNewMsd, rlRules, lsett);
return Add(leNew);
}
private LemmaExample Add(LemmaExample leNew)
{
LemmaExample leReturn = null;
if (!dictExamples.TryGetValue(leNew.Signature, out leReturn))
{
leReturn = leNew;
dictExamples.Add(leReturn.Signature, leReturn);
}
else
leReturn.Join(leNew);
lstExamples = null;
return leReturn;
}
public void DropExamples()
{
dictExamples.Clear();
lstExamples = null;
}
public void FinalizeAdditions()
{
if (lstExamples != null)
return;
lstExamples = new List<LemmaExample>(dictExamples.Values);
lstExamples.Sort();
}
public ExampleList GetFrontRearExampleList(bool front)
{
var elExamplesNew = new ExampleList(lsett);
foreach (var le in this.ListExamples)
{
if (front)
elExamplesNew.AddExample(le.WordFront, le.LemmaFront, le.Weight, le.Msd);
else
elExamplesNew.AddExample(le.WordRear, le.LemmaRear, le.Weight, le.Msd);
}
elExamplesNew.FinalizeAdditions();
return elExamplesNew;
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
foreach (var exm in lstExamples)
{
sb.AppendLine(exm.ToString());
}
return sb.ToString();
}
#endregion
#region Serialization Functions (.Net Default - ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("iNumExamples", dictExamples.Count);
var aWords = new string[dictExamples.Count];
var aLemmas = new string[dictExamples.Count];
var aWeights = new double[dictExamples.Count];
var aMsds = new string[dictExamples.Count];
int iExm = 0;
foreach (var exm in dictExamples.Values)
{
aWords[iExm] = exm.Word;
aLemmas[iExm] = exm.Lemma;
aWeights[iExm] = exm.Weight;
aMsds[iExm] = exm.Msd;
iExm++;
}
info.AddValue("aWords", aWords);
info.AddValue("aLemmas", aLemmas);
info.AddValue("aWeights", aWeights);
info.AddValue("aMsds", aMsds);
}
public ExampleList(SerializationInfo info, StreamingContext context)
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
var aWords = (string[])info.GetValue("aWords", typeof(string[]));
var aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
var aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
var aMsds = (string[])info.GetValue("aMsds", typeof(string[]));
for (int iExm = 0; iExm < aWords.Length; iExm++)
AddExample(aWords[iExm], aLemmas[iExm], aWeights[iExm], aMsds[iExm]);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
rlRules.Serialize(binWrt, false);
if (!bSerializeExamples)
{
binWrt.Write(false); // lstExamples == null
binWrt.Write(0); // dictExamples.Count == 0
}
else
{
if (lstExamples == null)
{
binWrt.Write(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.Write(iCount);
foreach (var kvp in dictExamples)
{
binWrt.Write(kvp.Value.Rule.Signature);
kvp.Value.Serialize(binWrt, false);
}
}
else
{
binWrt.Write(true); // lstExamples != null
//save list & dictionary items
var iCount = lstExamples.Count;
binWrt.Write(iCount);
foreach (var le in lstExamples)
{
binWrt.Write(le.Rule.Signature);
le.Serialize(binWrt, false);
}
}
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
var bCreateLstExamples = binRead.ReadBoolean();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
var iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var lrRule = rlRules[binRead.ReadString()];
var le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples)
lstExamples.Add(le);
}
}
public ExampleList(BinaryReader binRead, LemmatizerSettings lsett)
{
Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bSerializeExamples, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
rlRules.Save(binWrt, false);
if (!bSerializeExamples) {
binWrt.WriteBool(false); // lstExamples == null
binWrt.WriteInt(0); // dictExamples.Count == 0
}
else {
if (lstExamples == null) {
binWrt.WriteBool(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaExample> kvp in dictExamples) {
binWrt.WriteString(kvp.Value.Rule.Signature);
kvp.Value.Save(binWrt, false);
}
}
else {
binWrt.WriteBool(true); // lstExamples != null
//save list & dictionary items
int iCount = lstExamples.Count;
binWrt.WriteInt(iCount);
foreach (LemmaExample le in lstExamples) {
binWrt.WriteString(le.Rule.Signature);
le.Save(binWrt, false);
}
}
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
bool bCreateLstExamples = binRead.ReadBool();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
LemmaRule lrRule = rlRules[binRead.ReadString()];
LemmaExample le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples) lstExamples.Add(le);
}
}
public ExampleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,481 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
public class LemmaExample : IComparable<LemmaExample>, IComparer<LemmaExample>
{
#region Private Variables
private string sWord;
private string sLemma;
private string sSignature;
private string sMsd;
private double dWeight;
private LemmaRule lrRule;
private LemmatizerSettings lsett;
private string sWordRearCache;
private string sWordFrontCache;
private string sLemmaFrontCache;
#endregion
#region Constructor(s)
public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.sWord = sWord;
this.sLemma = sLemma;
this.sMsd = sMsd;
this.dWeight = dWeight;
this.lrRule = rlRules.AddRule(this);
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
case LemmatizerSettings.MsdConsideration.JoinAll:
case LemmatizerSettings.MsdConsideration.JoinDistinct:
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
sSignature = string.Format("[{0}]==>[{1}]", sWord, sLemma);
break;
case LemmatizerSettings.MsdConsideration.Distinct:
default:
sSignature = string.Format("[{0}]==>[{1}]({2})", sWord, sLemma, sMsd ?? "");
break;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
#endregion
#region Public Properties
public string Word
{
get
{
return sWord;
}
}
public string Lemma
{
get
{
return sLemma;
}
}
public string Msd
{
get
{
return sMsd;
}
}
public string Signature
{
get
{
return sSignature;
}
}
public double Weight
{
get
{
return dWeight;
}
}
public LemmaRule Rule
{
get
{
return lrRule;
}
}
/// <summary>
/// Word to be pre-lemmatized with the Front-Lemmatizer into LemmaFront, which is then lemmatized by the standard Rear-Lemmatizer (warning: it is reversed)
/// </summary>
public string WordFront
{
get
{
if (sWordFrontCache == null)
sWordFrontCache = StringReverse(sWord);
return sWordFrontCache;
}
}
/// <summary>
/// Lemma to be produced by pre-lemmatizing with the Front-Lemmatizer (warning: it is reversed)
/// </summary>
public string LemmaFront
{
get
{
if (sLemmaFrontCache == null)
sLemmaFrontCache = StringReverse(WordRear);
return sLemmaFrontCache;
}
}
/// <summary>
/// Word to be lemmatized by the standard Rear-Lemmatizer (its beginning has already been modified by the Front-Lemmatizer)
/// </summary>
public string WordRear
{
get
{
if (sWordRearCache == null)
{
int lemmaPos = 0, wordPos = 0;
var common = LongestCommonSubstring(sWord, sLemma, ref wordPos, ref lemmaPos);
sWordRearCache = lemmaPos == -1 ? sLemma : (sLemma.Substring(0, lemmaPos + common.Length) + sWord.Substring(wordPos + common.Length));
}
return sWordRearCache;
}
}
/// <summary>
/// lemma to be produced by standard Rear-Lemmatizer from WordRear
/// </summary>
public string LemmaRear
{
get
{
return sLemma;
}
}
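// Worked example (illustrative only): for the German pair sWord = "gesagt", sLemma = "sagen" the
// longest common substring is "sag", so WordRear = "sagt" (the word with its prefix already
// normalized), LemmaRear = "sagen", WordFront = "tgaseg" (the reversed word) and LemmaFront = "tgas"
// (the reversed WordRear). The Front-Lemmatizer is thus trained to strip the reversed prefix "eg"
// ("ge"), while the Rear-Lemmatizer is trained on the pair ("sagt", "sagen").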
#endregion
#region Essential Class Functions (joining two examples into one)
//TODO - this function is not totally OK because sMsd should not be
//changed, since it could be included in the signature
public void Join(LemmaExample leJoin)
{
dWeight += leJoin.dWeight;
if (sMsd != null)
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
sMsd = null;
break;
case LemmatizerSettings.MsdConsideration.Distinct:
break;
case LemmatizerSettings.MsdConsideration.JoinAll:
sMsd += "|" + leJoin.sMsd;
break;
case LemmatizerSettings.MsdConsideration.JoinDistinct:
var append = string.Format("|{0}", leJoin.sMsd);
if (false == sMsd.Equals(leJoin.sMsd, StringComparison.Ordinal) &&
sMsd.IndexOf(append) < 0)
{
sMsd += append;
}
break;
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
int iPos = 0;
var iMax = Math.Min(sMsd.Length, leJoin.sMsd.Length);
while (iPos < iMax && sMsd[iPos] == leJoin.sMsd[iPos])
iPos++;
sMsd = sMsd.Substring(0, iPos);
break;
default:
break;
}
}
#endregion
#region Essential Class Functions (calculating similarities between examples)
public int Similarity(LemmaExample le)
{
return Similarity(this, le);
}
public static int Similarity(LemmaExample le1, LemmaExample le2)
{
var sWord1 = le1.sWord;
var sWord2 = le2.sWord;
var iLen1 = sWord1.Length;
var iLen2 = sWord2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sWord1[iLen1 - iPos] != sWord2[iLen2 - iPos])
return iPos - 1;
}
//TODO similarity should be bigger if two words are totally equal
//if (sWord1 == sWord2)
// return iMaxLen + 1;
//else
return iMaxLen;
}
#endregion
#region Essential Class Functions (comparing examples, e.g. for sorting)
/// <summary>
/// Compares the current MultextExample (ME) against the argument ME.
/// Mainly used for sorting lists of MEs.
/// </summary>
/// <param name="other">MultextExample (ME) that the current ME is compared against.</param>
/// <returns>1 if the current ME is bigger, -1 if smaller and 0 if both are the same.</returns>
public int CompareTo(LemmaExample other)
{
var iComparison = CompareStrings(this.sWord, other.sWord, false);
if (iComparison != 0)
return iComparison;
iComparison = CompareStrings(this.sLemma, other.sLemma, true);
if (iComparison != 0)
return iComparison;
if (lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct &&
this.sMsd != null && other.sMsd != null)
{
iComparison = CompareStrings(this.sMsd, other.sMsd, true);
if (iComparison != 0)
return iComparison;
}
return 0;
}
public int Compare(LemmaExample x, LemmaExample y)
{
return x.CompareTo(y);
}
public static int CompareStrings(string sStr1, string sStr2, bool bForward)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
if (bForward)
{
for (int iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] > sStr2[iPos])
return 1;
if (sStr1[iPos] < sStr2[iPos])
return -1;
}
}
else
{
for (int iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sStr1[iLen1 - iPos] > sStr2[iLen2 - iPos])
return 1;
if (sStr1[iLen1 - iPos] < sStr2[iLen2 - iPos])
return -1;
}
}
if (iLen1 > iLen2)
return 1;
if (iLen1 < iLen2)
return -1;
return 0;
}
public static int EqualPrifixLen(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public static string LongestCommonSubstring(string sStr1, string sStr2, ref int iPosInStr1, ref int iPosInStr2)
{
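// Standard dynamic-programming longest common substring: l[i, j] holds the length of the common
// substring ending at sStr1[i] and sStr2[j]; the ref parameters report where the longest match starts.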
var l = new int[sStr1.Length + 1, sStr2.Length + 1];
int z = 0;
string ret = "";
iPosInStr1 = -1;
iPosInStr2 = -1;
for (var i = 0; i < sStr1.Length; i++)
{
for (var j = 0; j < sStr2.Length; j++)
{
if (sStr1[i] == sStr2[j])
{
if (i == 0 || j == 0)
{
l[i, j] = 1;
}
else
{
l[i, j] = l[i - 1, j - 1] + 1;
}
if (l[i, j] > z)
{
z = l[i, j];
iPosInStr1 = i - z + 1;
iPosInStr2 = j - z + 1;
ret = sStr1.Substring(i - z + 1, z);
}
}
}
}
return ret;
}
public static string StringReverse(string s)
{
if (s == null)
return null;
var charArray = new char[s.Length];
var iLast = s.Length - 1;
for (var i = 0; i <= iLast; i++)
charArray[i] = s[iLast - i];
return new string(charArray);
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
if (sWord != null)
sb.AppendFormat("W:\"{0}\" ", sWord);
if (sLemma != null)
sb.AppendFormat("L:\"{0}\" ", sLemma);
if (sMsd != null)
sb.AppendFormat("M:\"{0}\" ", sMsd);
if (false == Double.IsNaN(dWeight))
sb.AppendFormat("F:\"{0}\" ", dWeight);
if (lrRule != null)
sb.AppendFormat("R:{0} ", lrRule);
if (sb.Length > 0)
return sb.ToString(0, sb.Length - 1);
return string.Empty;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(sWord);
binWrt.Write(sLemma);
binWrt.Write(sSignature);
if (sMsd == null)
{
binWrt.Write(false);
}
else
{
binWrt.Write(true);
binWrt.Write(sMsd);
}
binWrt.Write(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject)
{
lsett.Serialize(binWrt);
lrRule.Serialize(binWrt, false);
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBoolean())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject)
{
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else
{
this.lsett = lsett;
this.lrRule = lrRule;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
Deserialize(binRead, lsett, lrRule);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteString(sWord);
binWrt.WriteString(sLemma);
binWrt.WriteString(sSignature);
if (sMsd == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sMsd);
}
binWrt.WriteDouble(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject) {
lsett.Save(binWrt);
lrRule.Save(binWrt, false);
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBool())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject) {
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else {
this.lsett = lsett;
this.lrRule = lrRule;
}
}
public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
Load(binRead, lsett, lrRule);
}
#endif
#endregion
}
}

@ -0,0 +1,189 @@
using System;
using System.IO;
namespace LemmaSharp
{
public class LemmaRule
{
#region Private Variables
private int iId;
private int iFrom;
private string sFrom;
private string sTo;
private string sSignature;
private LemmatizerSettings lsett;
#endregion
#region Constructor(s)
public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.iId = iId;
int iSameStem = SameStem(sWord, sLemma);
sTo = sLemma.Substring(iSameStem);
iFrom = sWord.Length - iSameStem;
if (lsett.bUseFromInRules)
{
sFrom = sWord.Substring(iSameStem);
sSignature = string.Format("[{0}]==>[{1}]", sFrom, sTo);
}
else
{
sFrom = null;
sSignature = string.Format("[#{0}]==>[{1}]", iFrom, sTo);
}
}
#endregion
#region Public Properties
public string Signature
{
get
{
return sSignature;
}
}
public int Id
{
get
{
return iId;
}
}
#endregion
#region Essential Class Functions
private static int SameStem(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public bool IsApplicableToGroup(int iGroupCondLen)
{
return iGroupCondLen >= iFrom;
}
public string Lemmatize(string sWord)
{
return sWord.Substring(0, sWord.Length - iFrom) + sTo;
}
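// Worked example (illustrative only): for the pair sWord = "walking", sLemma = "walk" the shared
// stem is "walk", so iFrom = 3, sTo = "" and (with bUseFromInRules) sFrom = "ing", giving the
// signature "[ing]==>[]". Lemmatize("talking") then drops the last iFrom characters and appends
// sTo, producing "talk".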
#endregion
#region Output Functions (ToString)
public override string ToString()
{
return string.Format("{0}:{1}", iId, sSignature);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(iId);
binWrt.Write(iFrom);
if (sFrom == null)
binWrt.Write(false);
else
{
binWrt.Write(true);
binWrt.Write(sFrom);
}
binWrt.Write(sTo);
binWrt.Write(sSignature);
if (bThisTopObject)
lsett.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
iId = binRead.ReadInt32();
iFrom = binRead.ReadInt32();
if (binRead.ReadBoolean())
{
sFrom = binRead.ReadString();
}
else
{
sFrom = null;
}
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteInt(iId);
binWrt.WriteInt(iFrom);
if (sFrom == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sFrom);
}
binWrt.WriteString(sTo);
binWrt.WriteString(sSignature);
if (bThisTopObject)
lsett.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
iId = binRead.ReadInt();
iFrom = binRead.ReadInt();
if (binRead.ReadBool())
sFrom = binRead.ReadString();
else
sFrom = null;
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,478 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class LemmaTreeNode : ILemmatizerModel
{
#region Private Variables
//settings
private LemmatizerSettings lsett;
//tree structure references
private Dictionary<char, LemmaTreeNode> dictSubNodes;
private LemmaTreeNode ltnParentNode;
//essential node properties
private int iSimilarity; //similarity among all words in this node
private string sCondition; //suffix that must match in order to lemmatize
private bool bWholeWord; //true if the condition has to match the whole word
//rules and weights
private LemmaRule lrBestRule; //the best rule to be applied when lemmatizing
private RuleWeighted[] aBestRules; //list of best rules
private double dWeight;
//source of this node
private int iStart;
private int iEnd;
private ExampleList elExamples;
#endregion
#region Constructor(s) & Destructor(s)
private LemmaTreeNode(LemmatizerSettings lsett)
{
this.lsett = lsett;
}
public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
: this(lsett, elExamples, 0, elExamples.Count - 1, null)
{
}
/// <summary>
///
/// </summary>
/// <param name="lsett"></param>
/// <param name="elExamples"></param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode"></param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode) : this(lsett)
{
this.ltnParentNode = ltnParentNode;
this.dictSubNodes = null;
this.iStart = iStart;
this.iEnd = iEnd;
this.elExamples = elExamples;
if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
{
lrBestRule = elExamples.Rules.DefaultRule;
aBestRules = new RuleWeighted[1];
aBestRules[0] = new RuleWeighted(lrBestRule, 0);
dWeight = 0;
return;
}
int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
this.bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;
FindBestRules();
AddSubAll();
//TODO check this heuristic; it can be problematic when there are more applicable rules
if (dictSubNodes != null)
{
var lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
foreach (var kvpChild in dictSubNodes)
if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
{
var enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
enumChildChild.MoveNext();
var ltrChildChild = enumChildChild.Current;
if (kvpChild.Value.lrBestRule == lrBestRule)
lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
}
foreach (var kvpChild in lReplaceNodes)
{
dictSubNodes[kvpChild.Key] = kvpChild.Value;
kvpChild.Value.ltnParentNode = this;
}
}
}
#endregion
#region Public Properties
public int TreeSize
{
get
{
int iCount = 1;
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
iCount += ltnChild.TreeSize;
}
}
return iCount;
}
}
public double Weight
{
get
{
return dWeight;
}
}
#endregion
#region Essential Class Functions (building model)
private void FindBestRules()
{
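// Overview: accumulate the weight of the group's examples per applicable rule; if no rule applies
// to the current condition, widen the condition (or fall back to the parent's best rule); finally
// keep only the best lsett.iMaxRulesPerNode rules, ordered by their relative weight within the node.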
/*
* LINQ SPEED TEST (slower than the current methodology)
*
List<LemmaExample> leApplicable = new List<LemmaExample>();
for (int iExm = iStart; iExm <= iEnd; iExm++)
if (elExamples[iExm].Rule.IsApplicableToGroup(sCondition.Length))
leApplicable.Add(elExamples[iExm]);
List<KeyValuePair<LemmaRule, double>> lBestRules = new List<KeyValuePair<LemmaRule,double>>();
lBestRules.AddRange(
leApplicable.
GroupBy<LemmaExample, LemmaRule, double, KeyValuePair<LemmaRule, double>>(
le => le.Rule,
le => le.Weight,
(lr, enumDbl) => new KeyValuePair<LemmaRule, double>(lr, enumDbl.Aggregate((acc, curr) => acc + curr))
).
OrderBy(kvpLrWght=>kvpLrWght.Value)
);
if (lBestRules.Count > 0)
lrBestRule = lBestRules[0].Key;
else {
lrBestRule = elExamples.Rules.DefaultRule;
}
*/
dWeight = 0;
//calculate dWeight of the whole node and the quality of every applicable rule
var dictApplicableRules = new Dictionary<LemmaRule, double>();
//dictApplicableRules.Add(elExamples.Rules.DefaultRule, 0);
while (dictApplicableRules.Count == 0)
{
for (var iExm = iStart; iExm <= iEnd; iExm++)
{
var lr = elExamples[iExm].Rule;
var dExmWeight = elExamples[iExm].Weight;
dWeight += dExmWeight;
if (lr.IsApplicableToGroup(sCondition.Length))
{
if (dictApplicableRules.ContainsKey(lr))
dictApplicableRules[lr] += dExmWeight;
else
dictApplicableRules.Add(lr, dExmWeight);
}
}
//if none is found, increase the condition length or add some default applicable rule
if (dictApplicableRules.Count == 0)
{
if (this.sCondition.Length < iSimilarity)
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - (sCondition.Length + 1));
else
//TODO check this heuristic; it might be better to add the default rule instead of the parent's rule
dictApplicableRules.Add(ltnParentNode.lrBestRule, 0);
}
}
//TODO can optimize this step using a sorted list (don't add if it's worse than the worst)
var lSortedRules = new List<RuleWeighted>();
foreach (var kvp in dictApplicableRules)
{
lSortedRules.Add(new RuleWeighted(kvp.Key, kvp.Value / dWeight));
}
lSortedRules.Sort();
//keep just best iMaxRulesPerNode rules
var iNumRules = lSortedRules.Count;
if (lsett.iMaxRulesPerNode > 0)
iNumRules = Math.Min(lSortedRules.Count, lsett.iMaxRulesPerNode);
aBestRules = new RuleWeighted[iNumRules];
for (var iRule = 0; iRule < iNumRules; iRule++)
{
aBestRules[iRule] = lSortedRules[iRule];
}
//set best rule
lrBestRule = aBestRules[0].Rule;
//TODO must check if this heuristic is OK (to privilege the parent rule)
if (ltnParentNode != null)
{
for (int iRule = 0; iRule < lSortedRules.Count &&
lSortedRules[iRule].Weight == lSortedRules[0].Weight; iRule++)
{
if (lSortedRules[iRule].Rule == ltnParentNode.lrBestRule)
{
lrBestRule = lSortedRules[iRule].Rule;
break;
}
}
}
}
private void AddSubAll()
{
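// Split the examples of this group by the character that follows the current similarity
// (counted from the rear of the word); a child node is created only for sub-groups containing
// at least one example whose rule differs from this node's best rule.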
int iStartGroup = iStart;
var chCharPrev = '\0';
var bSubGroupNeeded = false;
for (var iWrd = iStart; iWrd <= iEnd; iWrd++)
{
var sWord = elExamples[iWrd].Word;
var chCharThis = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (iWrd != iStart && chCharPrev != chCharThis)
{
if (bSubGroupNeeded)
{
AddSub(iStartGroup, iWrd - 1, chCharPrev);
bSubGroupNeeded = false;
}
iStartGroup = iWrd;
}
//TODO check out bSubGroupNeeded when there are multiple possible rules (not just lrBestRule)
if (elExamples[iWrd].Rule != lrBestRule)
{
bSubGroupNeeded = true;
}
chCharPrev = chCharThis;
}
if (bSubGroupNeeded && iStartGroup != iStart)
{
AddSub(iStartGroup, iEnd, chCharPrev);
}
}
private void AddSub(int iStart, int iEnd, char chChar)
{
var ltnSub = new LemmaTreeNode(lsett, elExamples, iStart, iEnd, this);
//TODO - maybe not really appropriate because it loses the statistics from multiple possible rules
if (ltnSub.lrBestRule == lrBestRule && ltnSub.dictSubNodes == null)
return;
if (dictSubNodes == null)
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
dictSubNodes.Add(chChar, ltnSub);
}
#endregion
#region Essential Class Functions (running model = lemmatizing)
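// Lemmatization descends the tree: the character of the word just beyond this node's similarity
// selects a child, which is used only if its (longer) suffix condition still matches the word;
// otherwise this node's best rule is applied to produce the lemma.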
public bool ConditionSatisfied(string sWord)
{
//if (bWholeWord)
// return sWord == sCondition;
//else
// return sWord.EndsWith(sCondition);
var iDiff = sWord.Length - sCondition.Length;
if (iDiff < 0 || (bWholeWord && iDiff > 0))
return false;
var iWrdEnd = sCondition.Length - ltnParentNode.sCondition.Length - 1;
for (var iChar = 0; iChar < iWrdEnd; iChar++)
{
if (sCondition[iChar] != sWord[iChar + iDiff])
return false;
}
return true;
}
public string Lemmatize(string sWord)
{
if (sWord.Length >= iSimilarity && dictSubNodes != null)
{
char chChar = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (dictSubNodes.ContainsKey(chChar) && dictSubNodes[chChar].ConditionSatisfied(sWord))
return dictSubNodes[chChar].Lemmatize(sWord);
}
return lrBestRule.Lemmatize(sWord);
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
ToString(sb, 0);
return sb.ToString();
}
private void ToString(StringBuilder sb, int iLevel)
{
sb.Append(new string('\t', iLevel));
sb.AppendFormat("Suffix=\"{0}{1}\"; ", bWholeWord ? "^" : string.Empty, sCondition);
sb.AppendFormat("Rule=\"{0}\"; ", lrBestRule);
sb.AppendFormat("Weight=\"{0}\"; ", dWeight);
if (aBestRules != null && aBestRules.Length > 0)
sb.AppendFormat("Cover={0}; ", aBestRules[0].Weight);
sb.Append("Rulles=");
if (aBestRules != null)
{
foreach (var rw in aBestRules)
sb.AppendFormat(" {0}", rw);
}
sb.Append("; ");
sb.AppendLine();
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
ltnChild.ToString(sb, iLevel + 1);
}
}
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(dictSubNodes != null);
if (dictSubNodes != null)
{
binWrt.Write(dictSubNodes.Count);
foreach (var kvp in dictSubNodes)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt);
}
}
binWrt.Write(iSimilarity);
binWrt.Write(sCondition);
binWrt.Write(bWholeWord);
binWrt.Write(lrBestRule.Signature);
binWrt.Write(aBestRules.Length);
for (var i = 0; i < aBestRules.Length; i++)
{
binWrt.Write(aBestRules[i].Rule.Signature);
binWrt.Write(aBestRules[i].Weight);
}
binWrt.Write(dWeight);
binWrt.Write(iStart);
binWrt.Write(iEnd);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
this.lsett = lsett;
if (binRead.ReadBoolean())
{
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
var iCount = binRead.ReadInt32();
for (var i = 0; i < iCount; i++)
{
var cKey = binRead.ReadChar();
var ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
{
dictSubNodes = null;
}
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt32();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBoolean();
lrBestRule = elExamples.Rules[binRead.ReadString()];
var iCountBest = binRead.ReadInt32();
aBestRules = new RuleWeighted[iCountBest];
for (var i = 0; i < iCountBest; i++)
{
aBestRules[i] =
new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
}
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt32();
iEnd = binRead.ReadInt32();
this.elExamples = elExamples;
}
public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
Deserialize(binRead, lsett, elExamples, ltnParentNode);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(dictSubNodes != null);
if (dictSubNodes != null) {
binWrt.WriteInt(dictSubNodes.Count);
foreach (KeyValuePair<char, LemmaTreeNode> kvp in dictSubNodes) {
binWrt.WriteChar(kvp.Key);
kvp.Value.Save(binWrt);
}
}
binWrt.WriteInt(iSimilarity);
binWrt.WriteString(sCondition);
binWrt.WriteBool(bWholeWord);
binWrt.WriteString(lrBestRule.Signature);
binWrt.WriteInt(aBestRules.Length);
for (int i = 0; i < aBestRules.Length; i++) {
binWrt.WriteString(aBestRules[i].Rule.Signature);
binWrt.WriteDouble(aBestRules[i].Weight);
}
binWrt.WriteDouble(dWeight);
binWrt.WriteInt(iStart);
binWrt.WriteInt(iEnd);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
this.lsett = lsett;
if (binRead.ReadBool()) {
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
int iCount = binRead.ReadInt();
for (int i = 0; i < iCount; i++) {
char cKey = binRead.ReadChar();
LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
dictSubNodes = null;
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBool();
lrBestRule = elExamples.Rules[binRead.ReadString()];
int iCountBest = binRead.ReadInt();
aBestRules = new RuleWeighted[iCountBest];
for (int i = 0; i < iCountBest; i++)
aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt();
iEnd = binRead.ReadInt();
this.elExamples = elExamples;
}
public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
Load(binRead, lsett, elExamples, ltnParentNode);
}
#endif
#endregion
#region Other (Temporarily)
//TODO - this is a temporary function, remove it
public bool CheckConsistency()
{
var bReturn = true;
if (dictSubNodes != null)
foreach (var ltnChild in dictSubNodes.Values)
bReturn = bReturn &&
ltnChild.CheckConsistency() &&
ltnChild.sCondition.EndsWith(sCondition);
return bReturn;
}
#endregion
}
}

@ -0,0 +1,465 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Runtime.Serialization;
using System.IO.Compression;
using SevenZip;
namespace LemmaSharp
{
[Serializable]
public class Lemmatizer : ITrainableLemmatizer
#if LATINO
, Latino.ISerializable
#endif
{
#region Private Variables
protected LemmatizerSettings lsett;
protected ExampleList elExamples;
protected LemmaTreeNode ltnRootNode;
protected LemmaTreeNode ltnRootNodeFront;
#endregion
#region Constructor(s)
public Lemmatizer() :
this(new LemmatizerSettings())
{ }
public Lemmatizer(LemmatizerSettings lsett)
{
this.lsett = lsett;
this.elExamples = new ExampleList(lsett);
this.ltnRootNode = null;
this.ltnRootNodeFront = null;
}
public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Private Properties
private LemmaTreeNode ltrRootNodeSafe
{
get
{
if (ltnRootNode == null)
BuildModel();
return ltnRootNode;
}
}
private LemmaTreeNode ltrRootNodeFrontSafe
{
get
{
if (ltnRootNodeFront == null && lsett.bBuildFrontLemmatizer)
BuildModel();
return ltnRootNodeFront;
}
}
#endregion
#region Public Properties
public LemmatizerSettings Settings
{
get
{
return lsett.CloneDeep();
}
}
public ExampleList Examples
{
get
{
return elExamples;
}
}
public RuleList Rules
{
get
{
return elExamples.Rules;
}
}
public LemmaTreeNode RootNode
{
get
{
return ltrRootNodeSafe;
}
}
public LemmaTreeNode RootNodeFront
{
get
{
return ltrRootNodeFrontSafe;
}
}
public ILemmatizerModel Model
{
get
{
return ltrRootNodeSafe;
}
}
#endregion
#region Essential Class Functions (adding examples to repository)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
this.elExamples.AddMultextFile(srIn, sFormat);
ltnRootNode = null;
}
public void AddExample(string sWord, string sLemma)
{
AddExample(sWord, sLemma, 1, null);
}
public void AddExample(string sWord, string sLemma, double dWeight)
{
AddExample(sWord, sLemma, dWeight, null);
}
public void AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
elExamples.AddExample(sWord, sLemma, dWeight, sMsd);
ltnRootNode = null;
}
public void DropExamples()
{
elExamples.DropExamples();
}
public void FinalizeAdditions()
{
elExamples.FinalizeAdditions();
}
#endregion
#region Essential Class Functions (building model & lemmatizing)
public void BuildModel()
{
if (ltnRootNode != null)
return;
if (!lsett.bBuildFrontLemmatizer)
{
//TODO remove: elExamples.FinalizeAdditions();
elExamples.FinalizeAdditions();
ltnRootNode = new LemmaTreeNode(lsett, elExamples);
}
else
{
ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
}
}
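// When a front lemmatizer was built, the word is first reversed and lemmatized by the front tree
// (which handles the beginning of the word); the intermediate result is reversed back and the
// regular tree then lemmatizes the end of the word.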
public string Lemmatize(string sWord)
{
if (!lsett.bBuildFrontLemmatizer)
{
return ltrRootNodeSafe.Lemmatize(sWord);
}
var sWordFront = LemmaExample.StringReverse(sWord);
var sLemmaFront = ltrRootNodeFrontSafe.Lemmatize(sWordFront);
var sWordRear = LemmaExample.StringReverse(sLemmaFront);
return ltrRootNodeSafe.Lemmatize(sWordRear);
}
#endregion
#region Serialization Functions (ISerializable)
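// Only the settings and the examples are written; the deserialization constructor rebuilds the tree by calling BuildModel().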
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("elExamples", elExamples);
}
public Lemmatizer(SerializationInfo info, StreamingContext context) : this()
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
elExamples = (ExampleList)info.GetValue("elExamples", typeof(ExampleList));
this.BuildModel();
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples)
{
lsett.Serialize(binWrt);
binWrt.Write(bSerializeExamples);
elExamples.Serialize(binWrt, bSerializeExamples, false);
if (!bSerializeExamples)
{
elExamples.GetFrontRearExampleList(false).Serialize(binWrt, bSerializeExamples, false);
elExamples.GetFrontRearExampleList(true).Serialize(binWrt, bSerializeExamples, false);
}
ltnRootNode.Serialize(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead)
{
lsett = new LemmatizerSettings(binRead);
var bSerializeExamples = binRead.ReadBoolean();
elExamples = new ExampleList(binRead, lsett);
ExampleList elExamplesRear;
ExampleList elExamplesFront;
if (bSerializeExamples)
{
elExamplesRear = elExamples.GetFrontRearExampleList(false);
elExamplesFront = elExamples.GetFrontRearExampleList(true);
}
else
{
elExamplesRear = new ExampleList(binRead, lsett);
elExamplesFront = new ExampleList(binRead, lsett);
}
if (!lsett.bBuildFrontLemmatizer)
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
}
}
//Do not change the order! If new compression algorithms are added, append them to the end; otherwise old files can no longer be loaded.
public enum Compression
{
None,
Deflate,
LZMA
}
public Lemmatizer(BinaryReader binRead)
{
var compr = (Compression)binRead.ReadByte();
if (compr == Compression.None)
Deserialize(binRead);
else
throw new Exception("Loading lemmatizer with binary reader on uncompressed stream is not supported.");
}
public Lemmatizer(Stream streamIn)
{
Deserialize(streamIn);
}
public void Serialize(Stream streamOut)
{
Serialize(streamOut, true, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples)
{
Serialize(streamOut, bSerializeExamples, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples, Compression compress)
{
streamOut.WriteByte((byte)compress);
switch (compress)
{
case Compression.None:
SerializeNone(streamOut, bSerializeExamples);
break;
case Compression.Deflate:
SerializeDeflate(streamOut, bSerializeExamples);
break;
case Compression.LZMA:
SerializeLZMA(streamOut, bSerializeExamples);
break;
default:
break;
}
}
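// Example (sketch): given a trained Lemmatizer instance 'lemmatizer' and an illustrative file name,
// persist the model with LZMA compression and load it back through the Lemmatizer(Stream)
// constructor, which reads the leading compression tag itself.
//   using (var fsOut = File.Create("model.lem"))
//       lemmatizer.Serialize(fsOut, true, Compression.LZMA);
//   using (var fsIn = File.OpenRead("model.lem"))
//       lemmatizer = new Lemmatizer(fsIn);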
private void SerializeNone(Stream streamOut, bool bSerializeExamples)
{
using (var binWrt = new BinaryWriter(streamOut))
{
this.Serialize(binWrt, bSerializeExamples);
}
}
private void SerializeDeflate(Stream streamOut, bool bSerializeExamples)
{
using (var streamOutNew = new DeflateStream(streamOut, CompressionMode.Compress, true))
{
using (var binWrt = new BinaryWriter(streamOutNew))
{
this.Serialize(binWrt, bSerializeExamples);
binWrt.Flush();
binWrt.Close();
}
}
}
private void SerializeLZMA(Stream streamOut, bool bSerializeExamples)
{
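// Output layout: 5 bytes of LZMA coder properties, the uncompressed size as 8 little-endian bytes,
// then the LZMA-compressed payload (DecompressLZMA reads the same layout back).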
CoderPropID[] propIDs =
{
CoderPropID.DictionarySize,
CoderPropID.PosStateBits,
CoderPropID.LitContextBits,
CoderPropID.LitPosBits,
CoderPropID.Algorithm,
CoderPropID.NumFastBytes,
CoderPropID.MatchFinder,
CoderPropID.EndMarker
};
Int32 dictionary = 1 << 23;
Int32 posStateBits = 2;
Int32 litContextBits = 3; // for normal files
Int32 litPosBits = 0;
Int32 algorithm = 2;
Int32 numFastBytes = 128;
var mf = "bt4";
var eos = false;
object[] properties =
{
(Int32)(dictionary),
(Int32)(posStateBits),
(Int32)(litContextBits),
(Int32)(litPosBits),
(Int32)(algorithm),
(Int32)(numFastBytes),
mf,
eos
};
using (var msTemp = new MemoryStream())
{
using (var binWrtTemp = new BinaryWriter(msTemp))
{
this.Serialize(binWrtTemp, bSerializeExamples);
msTemp.Position = 0;
var encoder = new SevenZip.Compression.LZMA.Encoder();
encoder.SetCoderProperties(propIDs, properties);
encoder.WriteCoderProperties(streamOut);
var fileSize = msTemp.Length;
for (int i = 0; i < 8; i++)
{
streamOut.WriteByte((Byte)(fileSize >> (8 * i)));
}
encoder.Code(msTemp, streamOut, -1, -1, null);
binWrtTemp.Close();
encoder = null;
}
msTemp.Close();
}
}
public void Deserialize(Stream streamIn)
{
var compr = (Compression)streamIn.ReadByte();
using (var streamInNew = Decompress(streamIn, compr))
{
using (var br = new BinaryReader(streamInNew))
{
Deserialize(br);
}
}
}
private Stream Decompress(Stream streamIn, Compression compress)
{
Stream streamInNew;
switch (compress)
{
case Compression.None:
default:
streamInNew = streamIn;
break;
case Compression.Deflate:
streamInNew = new DeflateStream(streamIn, CompressionMode.Decompress);
break;
case Compression.LZMA:
streamInNew = DecompressLZMA(streamIn);
break;
}
return streamInNew;
}
private Stream DecompressLZMA(Stream streamIn)
{
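// Mirrors SerializeLZMA: read 5 bytes of coder properties, then the 8-byte little-endian
// uncompressed size, then decode the rest of the stream into a MemoryStream.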
var properties = new byte[5];
if (streamIn.Read(properties, 0, 5) != 5)
throw new Exception("input .lzma is too short");
var decoder = new SevenZip.Compression.LZMA.Decoder();
decoder.SetDecoderProperties(properties);
long outSize = 0;
for (var i = 0; i < 8; i++)
{
var v = streamIn.ReadByte();
if (v < 0)
throw new Exception("Unexpected end of stream while reading the decompressed size.");
outSize |= ((long)(byte)v) << (8 * i);
}
var compressedSize = streamIn.Length - streamIn.Position;
var outStream = new MemoryStream();
decoder.Code(streamIn, outStream, compressedSize, outSize, null);
outStream.Seek(0, 0);
decoder = null;
return outStream;
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
lsett.Save(binWrt);
elExamples.Save(binWrt, true, false);
ltnRootNode.Save(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead) {
lsett = new LemmatizerSettings(binRead);
elExamples = new ExampleList(binRead, lsett);
if (!lsett.bBuildFrontLemmatizer) {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false) , null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
}
}
public Lemmatizer(Latino.BinarySerializer binRead) {
Load(binRead);
}
public void Save(Stream streamOut) {
Latino.BinarySerializer binWrt = new Latino.BinarySerializer(streamOut);
this.Save(binWrt);
binWrt.Close();
}
public void Load(Stream streamIn) {
Latino.BinarySerializer binRead = new Latino.BinarySerializer(streamIn);
Load(binRead);
binRead.Close();
}
public Lemmatizer(Stream streamIn, string sDummy) {
Load(streamIn);
}
#endif
#endregion
}
}

@ -0,0 +1,143 @@
using System;
using System.IO;
using System.Runtime.Serialization;
namespace LemmaSharp
{
/// <summary>
/// These are the lemmagen algorithm settings that affect the speed/power of the learning and lemmatizing algorithm.
/// TODO this class will probably be removed in the future.
/// </summary>
[Serializable]
public class LemmatizerSettings : ISerializable
{
#region Constructor(s)
public LemmatizerSettings()
{
}
#endregion
#region Sub-Structures
/// <summary>
/// How the algorithm considers msd tags.
/// </summary>
public enum MsdConsideration
{
/// <summary>
/// Completely ignores msd tags (examples with different tags are joined and their weights summed).
/// </summary>
Ignore,
/// <summary>
/// Examples that are identical except for their msd tags are treated as distinct and are not joined.
/// </summary>
Distinct,
/// <summary>
/// Joins examples with different tags (concatenates all msd tags).
/// </summary>
JoinAll,
/// <summary>
/// Joins examples with different tags (concatenates only distinct msd tags - somewhat slower).
/// </summary>
JoinDistinct,
/// <summary>
/// Joins examples with different tags (the new tag is the left-to-right substring shared by all joined examples).
/// </summary>
JoinSameSubstring
}
#endregion
#region Public Variables
/// <summary>
/// True if the 'from' string should be included in the rule identifier ([from]->[to]); false if only the length of the 'from' string is used ([#len]->[to]).
/// </summary>
public bool bUseFromInRules = true;
/// <summary>
/// Specifies how the algorithm considers msd tags.
/// </summary>
public MsdConsideration eMsdConsider = MsdConsideration.Distinct;
/// <summary>
/// How many of the best rules are kept in memory for each node. Zero means unlimited.
/// </summary>
public int iMaxRulesPerNode = 0;
/// <summary>
/// If true, the build process uses a few additional heuristics to also build a front (left-to-right) lemmatizer, which lemmatizes the beginning of the word.
/// </summary>
public bool bBuildFrontLemmatizer = false;
#endregion
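// Example (sketch): settings for a front+rear lemmatizer that ignores msd tags.
//   var settings = new LemmatizerSettings {
//       eMsdConsider = MsdConsideration.Ignore,
//       bBuildFrontLemmatizer = true
//   };
//   var lemmatizer = new Lemmatizer(settings);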
#region Cloneable functions
public LemmatizerSettings CloneDeep()
{
return new LemmatizerSettings()
{
bUseFromInRules = this.bUseFromInRules,
eMsdConsider = this.eMsdConsider,
iMaxRulesPerNode = this.iMaxRulesPerNode,
bBuildFrontLemmatizer = this.bBuildFrontLemmatizer
};
}
#endregion
#region Serialization Functions (ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("bUseFromInRules", bUseFromInRules);
info.AddValue("eMsdConsider", eMsdConsider);
info.AddValue("iMaxRulesPerNode", iMaxRulesPerNode);
info.AddValue("bBuildFrontLemmatizer", bBuildFrontLemmatizer);
}
public LemmatizerSettings(SerializationInfo info, StreamingContext context)
{
bUseFromInRules = info.GetBoolean("bUseFromInRules");
eMsdConsider = (MsdConsideration)info.GetValue("eMsdConsider", typeof(MsdConsideration));
iMaxRulesPerNode = info.GetInt32("iMaxRulesPerNode");
bBuildFrontLemmatizer = info.GetBoolean("bBuildFrontLemmatizer");
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(bUseFromInRules);
binWrt.Write((int)eMsdConsider);
binWrt.Write(iMaxRulesPerNode);
binWrt.Write(bBuildFrontLemmatizer);
}
public void Deserialize(BinaryReader binRead)
{
bUseFromInRules = binRead.ReadBoolean();
eMsdConsider = (MsdConsideration)binRead.ReadInt32();
iMaxRulesPerNode = binRead.ReadInt32();
bBuildFrontLemmatizer = binRead.ReadBoolean();
}
public LemmatizerSettings(System.IO.BinaryReader binRead)
{
this.Deserialize(binRead);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(bUseFromInRules);
binWrt.WriteInt((int)eMsdConsider);
binWrt.WriteInt(iMaxRulesPerNode);
binWrt.WriteBool(bBuildFrontLemmatizer);
}
public void Load(Latino.BinarySerializer binRead) {
bUseFromInRules = binRead.ReadBool();
eMsdConsider = (MsdConsideration)binRead.ReadInt();
iMaxRulesPerNode = binRead.ReadInt();
bBuildFrontLemmatizer = binRead.ReadBool();
}
public LemmatizerSettings(Latino.BinarySerializer reader) {
Load(reader);
}
#endif
#endregion
}
}

@ -0,0 +1,161 @@
using System.Collections.Generic;
using System.IO;
namespace LemmaSharp
{
public class RuleList : Dictionary<string, LemmaRule>
{
#region Private Variables
private LemmatizerSettings lsett;
private LemmaRule lrDefaultRule;
#endregion
#region Constructor(s)
public RuleList(LemmatizerSettings lsett)
{
this.lsett = lsett;
lrDefaultRule = AddRule(new LemmaRule("", "", 0, lsett));
}
#endregion
#region Public Properties
public LemmaRule DefaultRule
{
get
{
return lrDefaultRule;
}
}
#endregion
#region Essential Class Functions
public LemmaRule AddRule(LemmaExample le)
{
return AddRule(new LemmaRule(le.Word, le.Lemma, this.Count, lsett));
}
private LemmaRule AddRule(LemmaRule lrRuleNew)
{
LemmaRule lrRuleReturn = null;
if (!this.TryGetValue(lrRuleNew.Signature, out lrRuleReturn))
{
lrRuleReturn = lrRuleNew;
this.Add(lrRuleReturn.Signature, lrRuleReturn);
}
return lrRuleReturn;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
//save list items ---------------------------------------
var iCount = this.Count;
binWrt.Write(iCount);
foreach (var kvp in this)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt, false);
}
//default rule is already saved in the list. Here just save its signature.
binWrt.Write(lrDefaultRule.Signature);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var sKey = binRead.ReadString();
var lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
//save list items ---------------------------------------
int iCount = this.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this) {
binWrt.WriteString(kvp.Key);
kvp.Value.Save(binWrt, false);
}
//default rule is already saved in the list. Here just save its signature.
binWrt.WriteString(lrDefaultRule.Signature);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
string sKey = binRead.ReadString();
LemmaRule lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,50 @@
using System;
namespace LemmaSharp
{
[Serializable]
class RuleWeighted : IComparable<RuleWeighted>
{
#region Private Variables
private LemmaRule lrRule;
private double dWeight;
#endregion
#region Constructor(s)
public RuleWeighted(LemmaRule lrRule, double dWeight)
{
this.lrRule = lrRule;
this.dWeight = dWeight;
}
#endregion
#region Public Properties
public LemmaRule Rule
{
get { return lrRule; }
}
public double Weight
{
get { return dWeight; }
}
#endregion
#region Essential Class Functions (comparing objects, e.g. for sorting)
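// Sorts rules by descending weight; ties are broken by descending rule id, so heavier rules come first.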
public int CompareTo(RuleWeighted rl)
{
if (this.dWeight < rl.dWeight) return 1;
if (this.dWeight > rl.dWeight) return -1;
if (this.lrRule.Id < rl.lrRule.Id) return 1;
if (this.lrRule.Id > rl.lrRule.Id) return -1;
return 0;
}
#endregion
#region Output & Serialization Functions
public override string ToString()
{
return string.Format("{0}{1:(0.00%)}", lrRule, dWeight);
}
#endregion
}
}

@ -0,0 +1,9 @@
using System.Runtime.Serialization;
namespace LemmaSharp
{
public interface ILemmatizer : ISerializable
{
string Lemmatize(string sWord);
}
}

@ -0,0 +1,8 @@
namespace LemmaSharp
{
public interface ILemmatizerModel
{
string Lemmatize(string sWord);
string ToString();
}
}

@ -0,0 +1,12 @@
namespace LemmaSharp
{
public interface ITrainableLemmatizer : ILemmatizer
{
ExampleList Examples { get; }
ILemmatizerModel Model { get; }
void AddExample(string sWord, string sLemma);
void AddExample(string sWord, string sLemma, double dWeight);
void AddExample(string sWord, string sLemma, double dWeight, string sMsd);
void BuildModel();
}
}

@ -0,0 +1,539 @@
/*==========================================================================;
*
* (c) 2004-08 JSI. All rights reserved.
*
* File: BinarySerializer.cs
* Version: 1.0
* Desc: Binary serializer
* Author: Miha Grcar
* Created on: Oct-2004
* Last modified: May-2008
* Revision: May-2008
*
***************************************************************************/
//Remark: Use this file as a Latino compatibility checker. When it is included in
// the project it defines the symbol LATINO, which enables all the Latino-specific
// serialization functions. When excluded, this code is not compiled and the
// following Latino namespace is not added to the project.
using System;
using System.Runtime.InteropServices;
using System.Collections.Generic;
using System.Reflection;
using System.Text;
using System.IO;
#if LATINO
namespace Latino
{
/* .-----------------------------------------------------------------------
|
| Class BinarySerializer
|
'-----------------------------------------------------------------------
*/
public interface ISerializable {
// *** note that you need to implement a constructor that loads the instance if the class implements Latino.ISerializable
void Save(Latino.BinarySerializer writer);
}
public class BinarySerializer
{
private static Dictionary<string, string> m_full_to_short_type_name
= new Dictionary<string, string>();
private static Dictionary<string, string> m_short_to_full_type_name
= new Dictionary<string, string>();
private Stream m_stream;
private string m_data_dir
= ".";
private static void RegisterTypeName(string full_type_name, string short_type_name)
{
m_full_to_short_type_name.Add(full_type_name, short_type_name);
m_short_to_full_type_name.Add(short_type_name, full_type_name);
}
private static string GetFullTypeName(string short_type_name)
{
return m_short_to_full_type_name.ContainsKey(short_type_name) ? m_short_to_full_type_name[short_type_name] : short_type_name;
}
private static string GetShortTypeName(string full_type_name)
{
return m_full_to_short_type_name.ContainsKey(full_type_name) ? m_full_to_short_type_name[full_type_name] : full_type_name;
}
static BinarySerializer()
{
RegisterTypeName(typeof(bool).AssemblyQualifiedName, "b");
RegisterTypeName(typeof(byte).AssemblyQualifiedName, "ui1");
RegisterTypeName(typeof(sbyte).AssemblyQualifiedName, "i1");
RegisterTypeName(typeof(char).AssemblyQualifiedName, "c");
RegisterTypeName(typeof(double).AssemblyQualifiedName, "f8");
RegisterTypeName(typeof(float).AssemblyQualifiedName, "f4");
RegisterTypeName(typeof(int).AssemblyQualifiedName, "i4");
RegisterTypeName(typeof(uint).AssemblyQualifiedName, "ui4");
RegisterTypeName(typeof(long).AssemblyQualifiedName, "i8");
RegisterTypeName(typeof(ulong).AssemblyQualifiedName, "ui8");
RegisterTypeName(typeof(short).AssemblyQualifiedName, "i2");
RegisterTypeName(typeof(ushort).AssemblyQualifiedName, "ui2");
RegisterTypeName(typeof(string).AssemblyQualifiedName, "s");
}
public BinarySerializer(Stream stream)
{
//Utils.ThrowException(stream == null ? new ArgumentNullException("stream") : null);
m_stream = stream;
}
public BinarySerializer()
{
m_stream = new MemoryStream();
}
public BinarySerializer(string file_name, FileMode file_mode)
{
m_stream = new FileStream(file_name, file_mode); // throws ArgumentException, NotSupportedException, ArgumentNullException, SecurityException, FileNotFoundException, IOException, DirectoryNotFoundException, PathTooLongException, ArgumentOutOfRangeException
}
// *** Reading ***
private byte[] Read<T>() // Read<T>() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int sz = Marshal.SizeOf(typeof(T));
byte[] buffer = new byte[sz];
int num_bytes = m_stream.Read(buffer, 0, sz); // throws IOException, NotSupportedException, ObjectDisposedException
//Utils.ThrowException(num_bytes < sz ? new EndOfStreamException() : null);
return buffer;
}
public bool ReadBool()
{
return ReadByte() != 0;
}
public byte ReadByte() // ReadByte() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int val = m_stream.ReadByte(); // throws NotSupportedException, ObjectDisposedException
//Utils.ThrowException(val < 0 ? new EndOfStreamException() : null);
return (byte)val;
}
public sbyte ReadSByte()
{
return (sbyte)ReadByte();
}
private char ReadChar8()
{
return (char)ReadByte();
}
private char ReadChar16()
{
return BitConverter.ToChar(Read<ushort>(), 0);
}
public char ReadChar()
{
return ReadChar16();
}
public double ReadDouble()
{
return BitConverter.ToDouble(Read<double>(), 0);
}
public float ReadFloat()
{
return BitConverter.ToSingle(Read<float>(), 0);
}
public int ReadInt()
{
return BitConverter.ToInt32(Read<int>(), 0);
}
public uint ReadUInt()
{
return BitConverter.ToUInt32(Read<uint>(), 0);
}
public long ReadLong()
{
return BitConverter.ToInt64(Read<long>(), 0);
}
public ulong ReadULong()
{
return BitConverter.ToUInt64(Read<ulong>(), 0);
}
public short ReadShort()
{
return BitConverter.ToInt16(Read<short>(), 0);
}
public ushort ReadUShort()
{
return BitConverter.ToUInt16(Read<ushort>(), 0);
}
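// Strings are stored as a leading character count followed by the encoded bytes; a count of -1 marks
// a null string. The 8-bit (ASCII) variant is used only for type names; all other strings use UTF-16.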
private string ReadString8()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len];
m_stream.Read(buffer, 0, len); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.ASCII.GetString(buffer);
}
private string ReadString16()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len * 2];
m_stream.Read(buffer, 0, len * 2); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.Unicode.GetString(buffer);
}
public string ReadString()
{
return ReadString16(); // throws exceptions (see ReadString16())
}
public Type ReadType()
{
string type_name = ReadString8(); // throws exceptions (see ReadString8())
//Utils.ThrowException(type_name == null ? new InvalidDataException() : null);
return Type.GetType(GetFullTypeName(type_name)); // throws TargetInvocationException, ArgumentException, TypeLoadException, FileNotFoundException, FileLoadException, BadImageFormatException
}
public ValueType ReadValue(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsValueType ? new InvalidArgumentValueException("type") : null);
if (type == typeof(bool))
{
return ReadBool();
}
else if (type == typeof(byte))
{
return ReadByte();
}
else if (type == typeof(sbyte))
{
return ReadSByte();
}
else if (type == typeof(char))
{
return ReadChar();
}
else if (type == typeof(double))
{
return ReadDouble();
}
else if (type == typeof(float))
{
return ReadFloat();
}
else if (type == typeof(int))
{
return ReadInt();
}
else if (type == typeof(uint))
{
return ReadUInt();
}
else if (type == typeof(long))
{
return ReadLong();
}
else if (type == typeof(ulong))
{
return ReadULong();
}
else if (type == typeof(short))
{
return ReadShort();
}
else if (type == typeof(ushort))
{
return ReadUShort();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return (ValueType)cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else
{
//throw new ArgumentNotSupportedException("type");
throw new Exception("type");
}
}
public T ReadValue<T>()
{
return (T)(object)ReadValue(typeof(T)); // throws exceptions (see ReadValue(Type type))
}
public object ReadObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
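// Object tag written by WriteObject: 0 = null reference, 1 = the runtime type equals the requested type,
// 2 = a (short) type name follows and overrides the requested type.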
switch (ReadByte())
{
case 0:
return null;
case 1:
break;
case 2:
Type type_0 = ReadType(); // throws exceptions (see ReadType())
//Utils.ThrowException(type_0 == null ? new TypeLoadException() : null);
//Utils.ThrowException(!type.IsAssignableFrom(type_0) ? new InvalidArgumentValueException("type") : null);
type = type_0;
break;
default:
throw new InvalidDataException();
}
if (type == typeof(string))
{
return ReadString();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
//throw new InvalidArgumentValueException("type");
throw new Exception("type");
}
}
public T ReadObject<T>()
{
return (T)ReadObject(typeof(T)); // throws exceptions (see ReadObject(Type type))
}
public object ReadValueOrObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
return ReadObject(type); // throws exceptions (see ReadObject(Type type))
}
}
public T ReadValueOrObject<T>()
{
return (T)ReadValueOrObject(typeof(T)); // throws exceptions (see ReadValueOrObject(Type type))
}
// *** Writing ***
private void Write(byte[] data) // Write(byte[] data) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.Write(data, 0, data.Length); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteBool(bool val)
{
WriteByte(val ? (byte)1 : (byte)0);
}
public void WriteByte(byte val) // WriteByte(byte val) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.WriteByte(val); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteSByte(sbyte val)
{
WriteByte((byte)val);
}
private void WriteChar8(char val)
{
WriteByte(Encoding.ASCII.GetBytes(new char[] { val })[0]);
}
private void WriteChar16(char val)
{
Write(BitConverter.GetBytes((ushort)val));
}
public void WriteChar(char val)
{
WriteChar16(val);
}
public void WriteDouble(double val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteFloat(float val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteInt(int val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUInt(uint val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteLong(long val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteULong(ulong val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteShort(short val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUShort(ushort val)
{
Write(BitConverter.GetBytes(val));
}
private void WriteString8(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.ASCII.GetBytes(val));
}
private void WriteString16(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.Unicode.GetBytes(val));
}
public void WriteString(string val)
{
WriteString16(val);
}
public void WriteValue(ValueType val)
{
if (val is bool)
{
WriteBool((bool)val);
}
else if (val is byte)
{
WriteByte((byte)val);
}
else if (val is sbyte)
{
WriteSByte((sbyte)val);
}
else if (val is char)
{
WriteChar((char)val);
}
else if (val is double)
{
WriteDouble((double)val);
}
else if (val is float)
{
WriteFloat((float)val);
}
else if (val is int)
{
WriteInt((int)val);
}
else if (val is uint)
{
WriteUInt((uint)val);
}
else if (val is long)
{
WriteLong((long)val);
}
else if (val is ulong)
{
WriteULong((ulong)val);
}
else if (val is short)
{
WriteShort((short)val);
}
else if (val is ushort)
{
WriteUShort((ushort)val);
}
else if (val is Latino.ISerializable)
{
((Latino.ISerializable)val).Save(this); // throws serialization-related exceptions
}
else
{
//throw new ArgumentTypeException("val");
}
}
public void WriteObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException((obj != null && !type.IsAssignableFrom(obj.GetType())) ? new ArgumentTypeException("obj") : null);
if (obj == null)
{
WriteByte(0);
}
else
{
Type obj_type = obj.GetType();
if (obj_type == type)
{
WriteByte(1);
}
else
{
WriteByte(2);
WriteType(obj_type);
}
if (obj is string)
{
WriteString((string)obj);
}
else if (obj is Latino.ISerializable)
{
((Latino.ISerializable)obj).Save(this); // throws serialization-related exceptions
}
else if (obj is ValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
//throw new ArgumentTypeException("obj");
}
}
}
public void WriteObject<T>(T obj)
{
WriteObject(typeof(T), obj); // throws exceptions (see WriteObject(Type type, object obj))
}
public void WriteValueOrObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsAssignableFrom(obj.GetType()) ? new ArgumentTypeException("obj") : null);
if (type.IsValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
WriteObject(type, obj); // throws exceptions (see WriteObject(Type type, object obj))
}
}
public void WriteValueOrObject<T>(T obj)
{
WriteValueOrObject(typeof(T), obj); // throws exceptions (see WriteValueOrObject(Type type, object obj))
}
public void WriteType(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
WriteString8(GetShortTypeName(type.AssemblyQualifiedName));
}
// *** Data directory ***
public string DataDir
{
get { return m_data_dir; }
set
{
//Utils.ThrowException(!Utils.VerifyPathName(value, /*must_exist=*/true) ? new InvalidArgumentValueException("DataDir") : null);
m_data_dir = value;
}
}
// *** Access to the associated stream ***
public void Close()
{
m_stream.Close();
}
public void Flush()
{
m_stream.Flush(); // throws IOException
}
public Stream Stream
{
get { return m_stream; }
}
}
}
#endif

@ -0,0 +1,165 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{A39293C1-92D8-47B9-93A4-41F443B4F9E4}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharp</RootNamespace>
<AssemblyName>LemmaSharp</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<IsWebBootstrapper>true</IsWebBootstrapper>
<StartupObject>
</StartupObject>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<TargetFrameworkProfile />
<PublishUrl>http://localhost/LemmaSharp/</PublishUrl>
<Install>true</Install>
<InstallFrom>Web</InstallFrom>
<UpdateEnabled>true</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>TRACE;DEBUG;NOLATINO</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE;NOLATINO</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x86\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<OutputPath>bin\x86\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="Lzma#, Version=4.12.3884.11200, Culture=neutral, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>ExternalLibs\Lzma#.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="LatinoCompatibility\BinarySerializer.cs">
<SubType>Code</SubType>
</Compile>
<Compile Include="Interfaces\ILemmatizer.cs" />
<Compile Include="Interfaces\ILemmatizerModel.cs" />
<Compile Include="Interfaces\ILemmatizerTrainable.cs" />
<Compile Include="Classes\LemmatizerSettings.cs" />
<Compile Include="Classes\LemmaRule.cs" />
<Compile Include="Classes\Lemmatizer.cs" />
<Compile Include="Classes\LemmaTreeNode.cs" />
<Compile Include="Classes\LemmaExample.cs" />
<Compile Include="Classes\ExampleList.cs" />
<Compile Include="Classes\RuleList.cs" />
<Compile Include="Classes\RuleWeighted.cs" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Client.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework Client Profile</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.2.0">
<Visible>False</Visible>
<ProductName>.NET Framework 2.0 %28x86%29</ProductName>
<Install>true</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.0">
<Visible>False</Visible>
<ProductName>.NET Framework 3.0 %28x86%29</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>false</Install>
</BootstrapperPackage>
</ItemGroup>
<ItemGroup>
<Folder Include="Properties\" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

@ -0,0 +1,28 @@
namespace LemmaSharp
{
public enum LanguagePrebuilt
{
//from Multext-East v4 lexicons
Bulgarian,
Czech,
English,
Estonian,
Persian,
French,
Hungarian,
Macedonian,
Polish,
Romanian,
Russian,
Slovak,
Slovene,
Serbian,
Ukrainian,
//from Multext lexicons
EnglishMT,
FrenchMT,
German,
Italian,
Spanish,
}
}

@ -0,0 +1,118 @@
using System;
using System.IO;
using System.Reflection;
using System.Runtime.Serialization;
namespace LemmaSharp
{
[Serializable]
public abstract class LemmatizerPrebuilt : Lemmatizer
{
#region Private Variables
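// Flattened (language code, lexicon family) pairs indexed by LanguagePrebuilt: entry 2*i is the
// ISO-style language code and entry 2*i+1 is the lexicon name; GetResourceFileName combines them
// into resource names such as "mlteast-en".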
private static string[] asLangMapping = new string[] {
"bg", "mlteast",
"cs", "mlteast",
"en", "mlteast",
"et", "mlteast",
"fa", "mlteast",
"fr", "mlteast",
"hu", "mlteast",
"mk", "mlteast",
"pl", "mlteast",
"ro", "mlteast",
"ru", "mlteast",
"sk", "mlteast",
"sl", "mlteast",
"sr", "mlteast",
"uk", "mlteast",
"en", "multext",
"fr", "multext",
"ge", "multext",
"it", "multext",
"sp", "multext",
};
private LanguagePrebuilt lang;
#endregion
#region Constructor(s)
public LemmatizerPrebuilt(LanguagePrebuilt lang)
: base()
{
this.lang = lang;
}
public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
: base(lsett)
{
this.lang = lang;
}
#endregion
#region Private Properties Helping Functions
protected string GetResourceFileName(string sFileMask)
{
return GetResourceFileName(sFileMask, lang);
}
public static string GetResourceFileName(string sFileMask, LanguagePrebuilt lang)
{
string langFileName = asLangMapping[(int)lang * 2 + 1] + '-' + asLangMapping[(int)lang * 2];
return string.Format(sFileMask, langFileName);
}
#endregion
#region Public Properties
public LanguagePrebuilt Language
{
get
{
return lang;
}
}
public LexiconPrebuilt Lexicon
{
get
{
return GetLexicon(lang);
}
}
#endregion
#region Public Properties
public static LexiconPrebuilt GetLexicon(LanguagePrebuilt lang)
{
return (LexiconPrebuilt)Enum.Parse(typeof(LexiconPrebuilt), asLangMapping[((int)lang) * 2 + 1], true);
}
#endregion
#region Resource Management Functions
protected abstract Assembly GetExecutingAssembly();
protected Stream GetResourceStream(string sResourceShortName)
{
var assembly = GetExecutingAssembly();
string sResourceName = null;
foreach (var sResource in assembly.GetManifestResourceNames())
{
if (sResource.EndsWith(sResourceShortName))
{
sResourceName = sResource;
break;
}
}
if (String.IsNullOrEmpty(sResourceName))
return null;
return assembly.GetManifestResourceStream(sResourceName);
}
#endregion
#region Serialization Functions
public LemmatizerPrebuilt(SerializationInfo info, StreamingContext context)
: base(info, context)
{
}
#endregion
}
}

@ -0,0 +1,29 @@
using System;
using System.IO;
using System.Reflection;
namespace LemmaSharp
{
[Serializable]
public class LemmatizerPrebuiltCompact : LemmatizerPrebuilt
{
public const string FILEMASK = "compact7z-{0}.lem";
#region Constructor(s) & Destructor(s)
public LemmatizerPrebuiltCompact(LanguagePrebuilt lang)
: base(lang)
{
Stream stream = GetResourceStream(GetResourceFileName(FILEMASK));
this.Deserialize(stream);
stream.Close();
}
#endregion
#region Resource Management Functions
protected override Assembly GetExecutingAssembly()
{
return Assembly.GetExecutingAssembly();
}
#endregion
}
}

@ -0,0 +1,8 @@
namespace LemmaSharp
{
public enum LexiconPrebuilt
{
MltEast,
Multext
}
}

@ -0,0 +1,132 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{1E700D21-62D3-4525-93FE-C1FB0A1B0564}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharp</RootNamespace>
<AssemblyName>LemmaSharpPrebuilt</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<PublishUrl>publish\</PublishUrl>
<Install>true</Install>
<InstallFrom>Disk</InstallFrom>
<UpdateEnabled>false</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<IsWebBootstrapper>false</IsWebBootstrapper>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
<TargetFrameworkProfile />
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x86\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<OutputPath>bin\x86\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
</ItemGroup>
<ItemGroup>
<Compile Include="Classes\LanguagePrebuilt.cs" />
<Compile Include="Classes\LemmatizerPrebuilt.cs" />
<Compile Include="Classes\LexiconPrebuilt.cs" />
</ItemGroup>
<ItemGroup>
<Folder Include="Data\" />
<Folder Include="Properties\" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>true</Install>
</BootstrapperPackage>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\LemmaSharp\LemmaSharp.csproj">
<Project>{a39293c1-92d8-47b9-93a4-41f443b4f9e4}</Project>
<Name>LemmaSharp</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

@ -0,0 +1,111 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharpPrebuiltCompact</RootNamespace>
<AssemblyName>LemmaSharpPrebuiltCompact</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<TargetFrameworkProfile />
<PublishUrl>publish\</PublishUrl>
<Install>true</Install>
<InstallFrom>Disk</InstallFrom>
<UpdateEnabled>false</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<IsWebBootstrapper>false</IsWebBootstrapper>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
</ItemGroup>
<ItemGroup>
<Compile Include="Classes\LemmatizerPrebuiltCompact.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\LemmaSharp\LemmaSharp.csproj">
<Project>{A39293C1-92D8-47B9-93A4-41F443B4F9E4}</Project>
<Name>LemmaSharp</Name>
</ProjectReference>
<ProjectReference Include="LemmaSharpPrebuilt.csproj">
<Project>{1E700D21-62D3-4525-93FE-C1FB0A1B0564}</Project>
<Name>LemmaSharpPrebuilt</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Data\compact7z-mlteast-bg.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-cs.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-en.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-et.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-fa.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-fr.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-hu.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-mk.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-pl.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-ro.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-ru.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-sk.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-sl.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-sr.lem" />
<EmbeddedResource Include="Data\compact7z-mlteast-uk.lem" />
<EmbeddedResource Include="Data\compact7z-multext-en.lem" />
<EmbeddedResource Include="Data\compact7z-multext-fr.lem" />
<EmbeddedResource Include="Data\compact7z-multext-ge.lem" />
<EmbeddedResource Include="Data\compact7z-multext-it.lem" />
<EmbeddedResource Include="Data\compact7z-multext-sp.lem" />
</ItemGroup>
<ItemGroup>
<Folder Include="Properties\" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>true</Install>
</BootstrapperPackage>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

@ -0,0 +1,58 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaSharp", "LemmaSharp\LemmaSharp.csproj", "{A39293C1-92D8-47B9-93A4-41F443B4F9E4}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaSharpPrebuiltCompact", "LemmaSharpPrebuilt\LemmaSharpPrebuiltCompact.csproj", "{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaSharpPrebuilt", "LemmaSharpPrebuilt\LemmaSharpPrebuilt.csproj", "{1E700D21-62D3-4525-93FE-C1FB0A1B0564}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.ActiveCfg = Debug|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.Build.0 = Debug|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.ActiveCfg = Debug|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.Build.0 = Debug|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.Build.0 = Release|Any CPU
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.ActiveCfg = Release|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.Build.0 = Release|x64
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.ActiveCfg = Release|x86
{A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.Build.0 = Release|x86
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Debug|x64.ActiveCfg = Debug|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Debug|x86.ActiveCfg = Debug|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Release|Any CPU.Build.0 = Release|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Release|x64.ActiveCfg = Release|Any CPU
{9BA3F2C4-5DAB-4D7B-B431-B072A0D8FC6A}.Release|x86.ActiveCfg = Release|Any CPU
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|x64.ActiveCfg = Debug|x64
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|x64.Build.0 = Debug|x64
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|x86.ActiveCfg = Debug|x86
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Debug|x86.Build.0 = Debug|x86
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|Any CPU.Build.0 = Release|Any CPU
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|x64.ActiveCfg = Release|x64
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|x64.Build.0 = Release|x64
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|x86.ActiveCfg = Release|x86
{1E700D21-62D3-4525-93FE-C1FB0A1B0564}.Release|x86.Build.0 = Release|x86
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@ -0,0 +1,381 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.Serialization;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class ExampleList : ISerializable
{
#region Private Variables
private LemmatizerSettings lsett;
private RuleList rlRules;
private Dictionary<string, LemmaExample> dictExamples;
private List<LemmaExample> lstExamples;
#endregion
#region Constructor(s)
public ExampleList(LemmatizerSettings lsett) : base()
{
this.lsett = lsett;
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
}
public ExampleList(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Public Properties & Indexers
public LemmaExample this[int i]
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples[i];
}
}
public int Count
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples.Count;
}
}
public double WeightSum
{
get
{
if (lstExamples == null)
FinalizeAdditions();
double dWeight = 0;
foreach (LemmaExample exm in lstExamples)
dWeight += exm.Weight;
return dWeight;
}
}
public RuleList Rules
{
get
{
return rlRules;
}
}
public List<LemmaExample> ListExamples
{
get
{
if (lstExamples == null)
FinalizeAdditions();
return lstExamples;
}
}
#endregion
#region Essential Class Functions (adding/removing examples)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
//read from file
string sLine = null;
int iError = 0;
int iLine = 0;
var iW = sFormat.IndexOf('W');
var iL = sFormat.IndexOf('L');
var iM = sFormat.IndexOf('M');
var iF = sFormat.IndexOf('F');
var iLen = Math.Max(Math.Max(iW, iL), Math.Max(iM, iF)) + 1;
if (iW < 0 || iL < 0)
{
throw new Exception("Can not find word and lemma location in the format specification");
}
while ((sLine = srIn.ReadLine()) != null && iError < 50)
{
iLine++;
string[] asWords = sLine.Split(new char[] { '\t' });
if (asWords.Length < iLen)
{
//Console.WriteLine("ERROR: Line doesn't confirm to the given format \"" + sFormat + "\"! Line " + iLine.ToString() + ".");
iError++;
continue;
}
var sWord = asWords[iW];
var sLemma = asWords[iL];
if (sLemma.Equals("=", StringComparison.Ordinal))
sLemma = sWord;
string sMsd = null;
if (iM > -1)
sMsd = asWords[iM];
double dWeight = 1;
if (iF > -1)
Double.TryParse(asWords[iF], out dWeight); //the weight comes from the frequency ('F') column
AddExample(sWord, sLemma, dWeight, sMsd);
}
if (iError == 50)
throw new Exception("Parsing stopped because of too many (50) errors. Check format specification");
}
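//Illustrative note (not part of the original source): sFormat gives the tab-separated column
//order of the MULTEXT file, e.g. "WLM" means column 0 = word form, column 1 = lemma,
//column 2 = MSD tag; an optional 'F' column carries the example weight/frequency.
//A matching line for "WLM" would look like (made-up example): hodil<TAB>hoditi<TAB>Vmp-sm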
public LemmaExample AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
string sNewMsd = lsett.eMsdConsider != LemmatizerSettings.MsdConsideration.Ignore
? sMsd
: null;
var leNew = new LemmaExample(sWord, sLemma, dWeight, sNewMsd, rlRules, lsett);
return Add(leNew);
}
private LemmaExample Add(LemmaExample leNew)
{
LemmaExample leReturn = null;
if (!dictExamples.TryGetValue(leNew.Signature, out leReturn))
{
leReturn = leNew;
dictExamples.Add(leReturn.Signature, leReturn);
}
else
leReturn.Join(leNew);
lstExamples = null;
return leReturn;
}
public void DropExamples()
{
dictExamples.Clear();
lstExamples = null;
}
public void FinalizeAdditions()
{
if (lstExamples != null)
return;
lstExamples = new List<LemmaExample>(dictExamples.Values);
lstExamples.Sort();
}
public ExampleList GetFrontRearExampleList(bool front)
{
var elExamplesNew = new ExampleList(lsett);
foreach (var le in this.ListExamples)
{
if (front)
elExamplesNew.AddExample(le.WordFront, le.LemmaFront, le.Weight, le.Msd);
else
elExamplesNew.AddExample(le.WordRear, le.LemmaRear, le.Weight, le.Msd);
}
elExamplesNew.FinalizeAdditions();
return elExamplesNew;
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
foreach (var exm in lstExamples)
{
sb.AppendLine(exm.ToString());
}
return sb.ToString();
}
#endregion
#region Serialization Functions (.Net Default - ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("iNumExamples", dictExamples.Count);
var aWords = new string[dictExamples.Count];
var aLemmas = new string[dictExamples.Count];
var aWeights = new double[dictExamples.Count];
var aMsds = new string[dictExamples.Count];
int iExm = 0;
foreach (var exm in dictExamples.Values)
{
aWords[iExm] = exm.Word;
aLemmas[iExm] = exm.Lemma;
aWeights[iExm] = exm.Weight;
aMsds[iExm] = exm.Msd;
iExm++;
}
info.AddValue("aWords", aWords);
info.AddValue("aLemmas", aLemmas);
info.AddValue("aWeights", aWeights);
info.AddValue("aMsds", aMsds);
}
public ExampleList(SerializationInfo info, StreamingContext context)
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
this.dictExamples = new Dictionary<string, LemmaExample>();
this.lstExamples = null;
this.rlRules = new RuleList(lsett);
var aWords = (string[])info.GetValue("aWords", typeof(string[]));
var aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
var aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
var aMsds = (string[])info.GetValue("aMsds", typeof(string[]));
for (int iExm = 0; iExm < aWords.Length; iExm++)
AddExample(aWords[iExm], aLemmas[iExm], aWeights[iExm], aMsds[iExm]);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
rlRules.Serialize(binWrt, false);
if (!bSerializeExamples)
{
binWrt.Write(false); // lstExamples == null
binWrt.Write(0); // dictExamples.Count == 0
}
else
{
if (lstExamples == null)
{
binWrt.Write(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.Write(iCount);
foreach (var kvp in dictExamples)
{
binWrt.Write(kvp.Value.Rule.Signature);
kvp.Value.Serialize(binWrt, false);
}
}
else
{
binWrt.Write(true); // lstExamples != null
//save list & dictionary items
var iCount = lstExamples.Count;
binWrt.Write(iCount);
foreach (var le in lstExamples)
{
binWrt.Write(le.Rule.Signature);
le.Serialize(binWrt, false);
}
}
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
var bCreateLstExamples = binRead.ReadBoolean();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
var iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var lrRule = rlRules[binRead.ReadString()];
var le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples)
lstExamples.Add(le);
}
}
public ExampleList(BinaryReader binRead, LemmatizerSettings lsett)
{
Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bSerializeExamples, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
rlRules.Save(binWrt, false);
if (!bSerializeExamples) {
binWrt.WriteBool(false); // lstExamples == null
binWrt.WriteInt(0); // dictExamples.Count == 0
}
else {
if (lstExamples == null) {
binWrt.WriteBool(false); // lstExamples == null
//save dictionary items
int iCount = dictExamples.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaExample> kvp in dictExamples) {
binWrt.WriteString(kvp.Value.Rule.Signature);
kvp.Value.Save(binWrt, false);
}
}
else {
binWrt.WriteBool(true); // lstExamples != null
//save list & dictionary items
int iCount = lstExamples.Count;
binWrt.WriteInt(iCount);
foreach (LemmaExample le in lstExamples) {
binWrt.WriteString(le.Rule.Signature);
le.Save(binWrt, false);
}
}
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
rlRules = new RuleList(binRead, this.lsett);
bool bCreateLstExamples = binRead.ReadBool();
lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
dictExamples = new Dictionary<string, LemmaExample>();
//load dictionary items
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
LemmaRule lrRule = rlRules[binRead.ReadString()];
LemmaExample le = new LemmaExample(binRead, this.lsett, lrRule);
dictExamples.Add(le.Signature, le);
if (bCreateLstExamples) lstExamples.Add(le);
}
}
public ExampleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,481 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
public class LemmaExample : IComparable<LemmaExample>, IComparer<LemmaExample>
{
#region Private Variables
private string sWord;
private string sLemma;
private string sSignature;
private string sMsd;
private double dWeight;
private LemmaRule lrRule;
private LemmatizerSettings lsett;
private string sWordRearCache;
private string sWordFrontCache;
private string sLemmaFrontCache;
#endregion
#region Constructor(s)
public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.sWord = sWord;
this.sLemma = sLemma;
this.sMsd = sMsd;
this.dWeight = dWeight;
this.lrRule = rlRules.AddRule(this);
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
case LemmatizerSettings.MsdConsideration.JoinAll:
case LemmatizerSettings.MsdConsideration.JoinDistinct:
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
sSignature = string.Format("[{0}]==>[{1}]", sWord, sLemma);
break;
case LemmatizerSettings.MsdConsideration.Distinct:
default:
sSignature = string.Format("[{0}]==>[{1}]({2})", sWord, sLemma, sMsd ?? "");
break;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
#endregion
#region Public Properties
public string Word
{
get
{
return sWord;
}
}
public string Lemma
{
get
{
return sLemma;
}
}
public string Msd
{
get
{
return sMsd;
}
}
public string Signature
{
get
{
return sSignature;
}
}
public double Weight
{
get
{
return dWeight;
}
}
public LemmaRule Rule
{
get
{
return lrRule;
}
}
/// <summary>
/// Word to be pre-lemmatized by the Front-Lemmatizer into LemmaFront, which is then lemmatized by the standard Rear-Lemmatizer (warning: it is stored reversed)
/// </summary>
public string WordFront
{
get
{
if (sWordFrontCache == null)
sWordFrontCache = StringReverse(sWord);
return sWordFrontCache;
}
}
/// <summary>
/// Lemma to be produced by pre-lemmatizing with the Front-Lemmatizer (warning: it is stored reversed)
/// </summary>
public string LemmaFront
{
get
{
if (sLemmaFrontCache == null)
sLemmaFrontCache = StringReverse(WordRear);
return sLemmaFrontCache;
}
}
/// <summary>
/// Word to be lemmatized by the standard Rear-Lemmatizer (its beginning has already been modified by the Front-Lemmatizer)
/// </summary>
public string WordRear
{
get
{
if (sWordRearCache == null)
{
int lemmaPos = 0, wordPos = 0;
var common = LongestCommonSubstring(sWord, sLemma, ref wordPos, ref lemmaPos);
sWordRearCache = lemmaPos == -1 ? sLemma : (sLemma.Substring(0, lemmaPos + common.Length) + sWord.Substring(wordPos + common.Length));
}
return sWordRearCache;
}
}
/// <summary>
/// Lemma to be produced by the standard Rear-Lemmatizer from WordRear
/// </summary>
public string LemmaRear
{
get
{
return sLemma;
}
}
#endregion
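//Illustrative example (assumed German pair, not from the original data): for Word = "gearbeitet"
//and Lemma = "arbeiten" the longest common substring is "arbeite", so WordRear = "arbeitet"
//(lemma prefix + word suffix), WordFront = reverse("gearbeitet") and LemmaFront = reverse("arbeitet").
//The front lemmatizer thus learns to strip the "ge-" prefix (working on reversed strings), while
//the rear lemmatizer maps "arbeitet" to "arbeiten" as usual.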
#region Essential Class Functions (joining two examples into one)
//TODO - this function is not totally OK because sMsd should not be
//changed since it could be included in the signature
public void Join(LemmaExample leJoin)
{
dWeight += leJoin.dWeight;
if (sMsd != null)
switch (lsett.eMsdConsider)
{
case LemmatizerSettings.MsdConsideration.Ignore:
sMsd = null;
break;
case LemmatizerSettings.MsdConsideration.Distinct:
break;
case LemmatizerSettings.MsdConsideration.JoinAll:
sMsd += "|" + leJoin.sMsd;
break;
case LemmatizerSettings.MsdConsideration.JoinDistinct:
var append = string.Format("|{0}", leJoin.sMsd);
if (false == sMsd.Equals(leJoin.sMsd, StringComparison.Ordinal) &&
sMsd.IndexOf(append) < 0)
{
sMsd += append;
}
break;
case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
int iPos = 0;
var iMax = Math.Min(sMsd.Length, leJoin.sMsd.Length);
while (iPos < iMax && sMsd[iPos] == leJoin.sMsd[iPos])
iPos++;
sMsd = sMsd.Substring(0, iPos);
break;
default:
break;
}
}
#endregion
#region Essential Class Functions (calculating similarities between examples)
public int Similarity(LemmaExample le)
{
return Similarity(this, le);
}
public static int Similarity(LemmaExample le1, LemmaExample le2)
{
var sWord1 = le1.sWord;
var sWord2 = le2.sWord;
var iLen1 = sWord1.Length;
var iLen2 = sWord2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sWord1[iLen1 - iPos] != sWord2[iLen2 - iPos])
return iPos - 1;
}
//TODO similarity should be bigger if two words are totally equal
//if (sWord1 == sWord2)
// return iMaxLen + 1;
//else
return iMaxLen;
}
#endregion
#region Essential Class Functions (comparing examples - eg.: for sorting)
/// <summary>
/// Function used to compare the current MultextExample (ME) against the argument ME.
/// Mainly used for sorting lists of MEs.
/// </summary>
/// <param name="other"> MultextExample (ME) that we compare current ME against.</param>
/// <returns>1 if the current ME is bigger, -1 if smaller and 0 if both are the same.</returns>
public int CompareTo(LemmaExample other)
{
var iComparison = CompareStrings(this.sWord, other.sWord, false);
if (iComparison != 0)
return iComparison;
iComparison = CompareStrings(this.sLemma, other.sLemma, true);
if (iComparison != 0)
return iComparison;
if (lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct &&
this.sMsd != null && other.sMsd != null)
{
iComparison = CompareStrings(this.sMsd, other.sMsd, true);
if (iComparison != 0)
return iComparison;
}
return 0;
}
public int Compare(LemmaExample x, LemmaExample y)
{
return x.CompareTo(y);
}
public static int CompareStrings(string sStr1, string sStr2, bool bForward)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
if (bForward)
{
for (int iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] > sStr2[iPos])
return 1;
if (sStr1[iPos] < sStr2[iPos])
return -1;
}
}
else
{
for (int iPos = 1; iPos <= iMaxLen; iPos++)
{
if (sStr1[iLen1 - iPos] > sStr2[iLen2 - iPos])
return 1;
if (sStr1[iLen1 - iPos] < sStr2[iLen2 - iPos])
return -1;
}
}
if (iLen1 > iLen2)
return 1;
if (iLen1 < iLen2)
return -1;
return 0;
}
public static int EqualPrifixLen(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public static string LongestCommonSubstring(string sStr1, string sStr2, ref int iPosInStr1, ref int iPosInStr2)
{
var l = new int[sStr1.Length + 1, sStr2.Length + 1];
int z = 0;
string ret = "";
iPosInStr1 = -1;
iPosInStr2 = -1;
for (var i = 0; i < sStr1.Length; i++)
{
for (var j = 0; j < sStr2.Length; j++)
{
if (sStr1[i] == sStr2[j])
{
if (i == 0 || j == 0)
{
l[i, j] = 1;
}
else
{
l[i, j] = l[i - 1, j - 1] + 1;
}
if (l[i, j] > z)
{
z = l[i, j];
iPosInStr1 = i - z + 1;
iPosInStr2 = j - z + 1;
ret = sStr1.Substring(i - z + 1, z);
}
}
}
}
return ret;
}
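//Note: StringReverse below reverses the string in place using XOR swaps on the character array;
//the result is equivalent to calling Array.Reverse(charArray).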
public static string StringReverse(string s)
{
if (s == null) return null;
char[] charArray = s.ToCharArray();
int len = s.Length - 1;
for (int i = 0; i < len; i++, len--)
{
charArray[i] ^= charArray[len];
charArray[len] ^= charArray[i];
charArray[i] ^= charArray[len];
}
return new string(charArray);
}
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
if (sWord != null)
sb.AppendFormat("W:\"{0}\" ", sWord);
if (sLemma != null)
sb.AppendFormat("L:\"{0}\" ", sLemma);
if (sMsd != null)
sb.AppendFormat("M:\"{0}\" ", sMsd);
if (false == Double.IsNaN(dWeight))
sb.AppendFormat("F:\"{0}\" ", dWeight);
if (lrRule != null)
sb.AppendFormat("R:{0} ", lrRule);
if (sb.Length > 0)
return sb.ToString(0, sb.Length - 1);
return string.Empty;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(sWord);
binWrt.Write(sLemma);
binWrt.Write(sSignature);
if (sMsd == null)
{
binWrt.Write(false);
}
else
{
binWrt.Write(true);
binWrt.Write(sMsd);
}
binWrt.Write(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject)
{
lsett.Serialize(binWrt);
lrRule.Serialize(binWrt, false);
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBoolean())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject)
{
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else
{
this.lsett = lsett;
this.lrRule = lrRule;
}
this.sWordRearCache = null;
this.sWordFrontCache = null;
this.sLemmaFrontCache = null;
}
public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
Deserialize(binRead, lsett, lrRule);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteString(sWord);
binWrt.WriteString(sLemma);
binWrt.WriteString(sSignature);
if (sMsd == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sMsd);
}
binWrt.WriteDouble(dWeight);
//save reference types if needed -------------------------
if (bThisTopObject) {
lsett.Save(binWrt);
lrRule.Save(binWrt, false);
}
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
sWord = binRead.ReadString();
sLemma = binRead.ReadString();
sSignature = binRead.ReadString();
if (binRead.ReadBool())
sMsd = binRead.ReadString();
else
sMsd = null;
dWeight = binRead.ReadDouble();
//load reference types if needed -------------------------
if (bThisTopObject) {
this.lsett = new LemmatizerSettings(binRead);
this.lrRule = new LemmaRule(binRead, this.lsett);
}
else {
this.lsett = lsett;
this.lrRule = lrRule;
}
}
public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
Load(binRead, lsett, lrRule);
}
#endif
#endregion
}
}

@ -0,0 +1,189 @@
using System;
using System.IO;
namespace LemmaSharp
{
public class LemmaRule
{
#region Private Variables
private int iId;
private int iFrom;
private string sFrom;
private string sTo;
private string sSignature;
private LemmatizerSettings lsett;
#endregion
#region Constructor(s)
public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett)
{
this.lsett = lsett;
this.iId = iId;
int iSameStem = SameStem(sWord, sLemma);
sTo = sLemma.Substring(iSameStem);
iFrom = sWord.Length - iSameStem;
if (lsett.bUseFromInRules)
{
sFrom = sWord.Substring(iSameStem);
sSignature = string.Format("[{0}]==>[{1}]", sFrom, sTo);
}
else
{
sFrom = null;
sSignature = string.Format("[#{0}]==>[{1}]", iFrom, sTo);
}
}
#endregion
#region Public Properties
public string Signature
{
get
{
return sSignature;
}
}
public int Id
{
get
{
return iId;
}
}
#endregion
#region Essential Class Functions
private static int SameStem(string sStr1, string sStr2)
{
var iLen1 = sStr1.Length;
var iLen2 = sStr2.Length;
var iMaxLen = Math.Min(iLen1, iLen2);
for (var iPos = 0; iPos < iMaxLen; iPos++)
{
if (sStr1[iPos] != sStr2[iPos])
return iPos;
}
return iMaxLen;
}
public bool IsApplicableToGroup(int iGroupCondLen)
{
return iGroupCondLen >= iFrom;
}
public string Lemmatize(string sWord)
{
return sWord.Substring(0, sWord.Length - iFrom) + sTo;
}
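//Illustrative example (made-up pair, not from the original data): a rule built from the example
//("arbeitet", "arbeiten") gets iFrom = 1, sTo = "n" and the signature "[t]==>[n]"
//(or "[#1]==>[n]" when bUseFromInRules is false); Lemmatize("arbeitet") then returns "arbeiten",
//and the same rule would also map "wartet" to "warten".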
#endregion
#region Output Functions (ToString)
public override string ToString()
{
return string.Format("{0}:{1}", iId, sSignature);
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(iId);
binWrt.Write(iFrom);
if (sFrom == null)
binWrt.Write(false);
else
{
binWrt.Write(true);
binWrt.Write(sFrom);
}
binWrt.Write(sTo);
binWrt.Write(sSignature);
if (bThisTopObject)
lsett.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
iId = binRead.ReadInt32();
iFrom = binRead.ReadInt32();
if (binRead.ReadBoolean())
{
sFrom = binRead.ReadString();
}
else
{
sFrom = null;
}
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteInt(iId);
binWrt.WriteInt(iFrom);
if (sFrom == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sFrom);
}
binWrt.WriteString(sTo);
binWrt.WriteString(sSignature);
if (bThisTopObject)
lsett.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
iId = binRead.ReadInt();
iFrom = binRead.ReadInt();
if (binRead.ReadBool())
sFrom = binRead.ReadString();
else
sFrom = null;
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,478 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace LemmaSharp
{
[Serializable]
public class LemmaTreeNode : ILemmatizerModel
{
#region Private Variables
//settings
private LemmatizerSettings lsett;
//tree structure references
private Dictionary<char, LemmaTreeNode> dictSubNodes;
private LemmaTreeNode ltnParentNode;
//essential node properties
private int iSimilarity; //similarity among all words in this node
private string sCondition; //suffix that must match in order to lemmatize
private bool bWholeWord; //true if condition has to match to whole word
//rules and weights;
private LemmaRule lrBestRule; //the best rule to be applied when lemmatizing
private RuleWeighted[] aBestRules; //list of best rules
private double dWeight;
//source of this node
private int iStart;
private int iEnd;
private ExampleList elExamples;
#endregion
#region Constructor(s) & Destructor(s)
private LemmaTreeNode(LemmatizerSettings lsett)
{
this.lsett = lsett;
}
public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
: this(lsett, elExamples, 0, elExamples.Count - 1, null)
{
}
/// <summary>
///
/// </summary>
/// <param name="lsett"></param>
/// <param name="elExamples"></param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode"></param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode) : this(lsett)
{
this.ltnParentNode = ltnParentNode;
this.dictSubNodes = null;
this.iStart = iStart;
this.iEnd = iEnd;
this.elExamples = elExamples;
if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
{
lrBestRule = elExamples.Rules.DefaultRule;
aBestRules = new RuleWeighted[1];
aBestRules[0] = new RuleWeighted(lrBestRule, 0);
dWeight = 0;
return;
}
int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
this.bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;
FindBestRules();
AddSubAll();
//TODO check this heuristic; it can be problematic when there are multiple applicable rules
if (dictSubNodes != null)
{
var lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
foreach (var kvpChild in dictSubNodes)
if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
{
var enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
enumChildChild.MoveNext();
var ltrChildChild = enumChildChild.Current;
if (kvpChild.Value.lrBestRule == lrBestRule)
lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
}
foreach (var kvpChild in lReplaceNodes)
{
dictSubNodes[kvpChild.Key] = kvpChild.Value;
kvpChild.Value.ltnParentNode = this;
}
}
}
#endregion
#region Public Properties
public int TreeSize
{
get
{
int iCount = 1;
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
iCount += ltnChild.TreeSize;
}
}
return iCount;
}
}
public double Weight
{
get
{
return dWeight;
}
}
#endregion
#region Essential Class Functions (building model)
private void FindBestRules()
{
/*
* LINQ SPEED TEST (slower than the current methodology)
*
List<LemmaExample> leApplicable = new List<LemmaExample>();
for (int iExm = iStart; iExm <= iEnd; iExm++)
if (elExamples[iExm].Rule.IsApplicableToGroup(sCondition.Length))
leApplicable.Add(elExamples[iExm]);
List<KeyValuePair<LemmaRule, double>> lBestRules = new List<KeyValuePair<LemmaRule,double>>();
lBestRules.AddRange(
leApplicable.
GroupBy<LemmaExample, LemmaRule, double, KeyValuePair<LemmaRule, double>>(
le => le.Rule,
le => le.Weight,
(lr, enumDbl) => new KeyValuePair<LemmaRule, double>(lr, enumDbl.Aggregate((acc, curr) => acc + curr))
).
OrderBy(kvpLrWght=>kvpLrWght.Value)
);
if (lBestRules.Count > 0)
lrBestRule = lBestRules[0].Key;
else {
lrBestRule = elExamples.Rules.DefaultRule;
}
*/
dWeight = 0;
//calculate dWeight of the whole node and the qualities of all rules
var dictApplicableRules = new Dictionary<LemmaRule, double>();
//dictApplicableRules.Add(elExamples.Rules.DefaultRule, 0);
while (dictApplicableRules.Count == 0)
{
for (var iExm = iStart; iExm <= iEnd; iExm++)
{
var lr = elExamples[iExm].Rule;
var dExmWeight = elExamples[iExm].Weight;
dWeight += dExmWeight;
if (lr.IsApplicableToGroup(sCondition.Length))
{
if (dictApplicableRules.ContainsKey(lr))
dictApplicableRules[lr] += dExmWeight;
else
dictApplicableRules.Add(lr, dExmWeight);
}
}
//if none found then increase the condition length or add some default applicable rule
if (dictApplicableRules.Count == 0)
{
if (this.sCondition.Length < iSimilarity)
this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - (sCondition.Length + 1));
else
//TODO check this heuristic; it might be better to add the default rule instead of the parent's rule
dictApplicableRules.Add(ltnParentNode.lrBestRule, 0);
}
}
//TODO can optimize this step using a sorted list (don't add if it's worse than the worst)
var lSortedRules = new List<RuleWeighted>();
foreach (var kvp in dictApplicableRules)
{
lSortedRules.Add(new RuleWeighted(kvp.Key, kvp.Value / dWeight));
}
lSortedRules.Sort();
//keep just best iMaxRulesPerNode rules
var iNumRules = lSortedRules.Count;
if (lsett.iMaxRulesPerNode > 0)
iNumRules = Math.Min(lSortedRules.Count, lsett.iMaxRulesPerNode);
aBestRules = new RuleWeighted[iNumRules];
for (var iRule = 0; iRule < iNumRules; iRule++)
{
aBestRules[iRule] = lSortedRules[iRule];
}
//set best rule
lrBestRule = aBestRules[0].Rule;
//TODO must check if this heuristic is OK (to privilege the parent rule)
if (ltnParentNode != null)
{
for (int iRule = 0; iRule < lSortedRules.Count &&
lSortedRules[iRule].Weight == lSortedRules[0].Weight; iRule++)
{
if (lSortedRules[iRule].Rule == ltnParentNode.lrBestRule)
{
lrBestRule = lSortedRules[iRule].Rule;
break;
}
}
}
}
private void AddSubAll()
{
int iStartGroup = iStart;
var chCharPrev = '\0';
var bSubGroupNeeded = false;
for (var iWrd = iStart; iWrd <= iEnd; iWrd++)
{
var sWord = elExamples[iWrd].Word;
var chCharThis = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (iWrd != iStart && chCharPrev != chCharThis)
{
if (bSubGroupNeeded)
{
AddSub(iStartGroup, iWrd - 1, chCharPrev);
bSubGroupNeeded = false;
}
iStartGroup = iWrd;
}
//TODO check out bSubGroupNeeded when there are multiple possible rules (not just lrBestRule)
if (elExamples[iWrd].Rule != lrBestRule)
{
bSubGroupNeeded = true;
}
chCharPrev = chCharThis;
}
if (bSubGroupNeeded && iStartGroup != iStart)
{
AddSub(iStartGroup, iEnd, chCharPrev);
}
}
private void AddSub(int iStart, int iEnd, char chChar)
{
var ltnSub = new LemmaTreeNode(lsett, elExamples, iStart, iEnd, this);
//TODO - maybe not really appropriate because it loses statistics from multiple possible rules
if (ltnSub.lrBestRule == lrBestRule && ltnSub.dictSubNodes == null)
return;
if (dictSubNodes == null)
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
dictSubNodes.Add(chChar, ltnSub);
}
#endregion
#region Essential Class Functions (running model = lemmatizing)
public bool ConditionSatisfied(string sWord)
{
//if (bWholeWord)
// return sWord == sCondition;
//else
// return sWord.EndsWith(sCondition);
var iDiff = sWord.Length - sCondition.Length;
if (iDiff < 0 || (bWholeWord && iDiff > 0))
return false;
var iWrdEnd = sCondition.Length - ltnParentNode.sCondition.Length - 1;
for (var iChar = 0; iChar < iWrdEnd; iChar++)
{
if (sCondition[iChar] != sWord[iChar + iDiff])
return false;
}
return true;
}
public string Lemmatize(string sWord)
{
if (sWord.Length >= iSimilarity && dictSubNodes != null)
{
char chChar = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
if (dictSubNodes.ContainsKey(chChar) && dictSubNodes[chChar].ConditionSatisfied(sWord))
return dictSubNodes[chChar].Lemmatize(sWord);
}
return lrBestRule.Lemmatize(sWord);
}
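//Note: lemmatization walks down the suffix tree, choosing the child keyed by the character just
//before the part of the word already matched by this node (sWord[sWord.Length - 1 - iSimilarity]);
//when no child matches or its condition fails, the current node's best rule is applied to the word.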
#endregion
#region Output Functions (ToString)
public override string ToString()
{
var sb = new StringBuilder();
ToString(sb, 0);
return sb.ToString();
}
private void ToString(StringBuilder sb, int iLevel)
{
sb.Append(new string('\t', iLevel));
sb.AppendFormat("Suffix=\"{0}{1}\"; ", bWholeWord ? "^" : string.Empty, sCondition);
sb.AppendFormat("Rule=\"{0}\"; ", lrBestRule);
sb.AppendFormat("Weight=\"{0}\"; ", dWeight);
if (aBestRules != null && aBestRules.Length > 0)
sb.AppendFormat("Cover={0}; ", aBestRules[0].Weight);
sb.Append("Rulles=");
if (aBestRules != null)
{
foreach (var rw in aBestRules)
sb.AppendFormat(" {0}", rw);
}
sb.Append("; ");
sb.AppendLine();
if (dictSubNodes != null)
{
foreach (var ltnChild in dictSubNodes.Values)
{
ltnChild.ToString(sb, iLevel + 1);
}
}
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(dictSubNodes != null);
if (dictSubNodes != null)
{
binWrt.Write(dictSubNodes.Count);
foreach (var kvp in dictSubNodes)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt);
}
}
binWrt.Write(iSimilarity);
binWrt.Write(sCondition);
binWrt.Write(bWholeWord);
binWrt.Write(lrBestRule.Signature);
binWrt.Write(aBestRules.Length);
for (var i = 0; i < aBestRules.Length; i++)
{
binWrt.Write(aBestRules[i].Rule.Signature);
binWrt.Write(aBestRules[i].Weight);
}
binWrt.Write(dWeight);
binWrt.Write(iStart);
binWrt.Write(iEnd);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
this.lsett = lsett;
if (binRead.ReadBoolean())
{
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
var iCount = binRead.ReadInt32();
for (var i = 0; i < iCount; i++)
{
var cKey = binRead.ReadChar();
var ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
{
dictSubNodes = null;
}
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt32();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBoolean();
lrBestRule = elExamples.Rules[binRead.ReadString()];
var iCountBest = binRead.ReadInt32();
aBestRules = new RuleWeighted[iCountBest];
for (var i = 0; i < iCountBest; i++)
{
aBestRules[i] =
new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
}
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt32();
iEnd = binRead.ReadInt32();
this.elExamples = elExamples;
}
public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
Deserialize(binRead, lsett, elExamples, ltnParentNode);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(dictSubNodes != null);
if (dictSubNodes != null) {
binWrt.WriteInt(dictSubNodes.Count);
foreach (KeyValuePair<char, LemmaTreeNode> kvp in dictSubNodes) {
binWrt.WriteChar(kvp.Key);
kvp.Value.Save(binWrt);
}
}
binWrt.WriteInt(iSimilarity);
binWrt.WriteString(sCondition);
binWrt.WriteBool(bWholeWord);
binWrt.WriteString(lrBestRule.Signature);
binWrt.WriteInt(aBestRules.Length);
for (int i = 0; i < aBestRules.Length; i++) {
binWrt.WriteString(aBestRules[i].Rule.Signature);
binWrt.WriteDouble(aBestRules[i].Weight);
}
binWrt.WriteDouble(dWeight);
binWrt.WriteInt(iStart);
binWrt.WriteInt(iEnd);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
this.lsett = lsett;
if (binRead.ReadBool()) {
dictSubNodes = new Dictionary<char, LemmaTreeNode>();
int iCount = binRead.ReadInt();
for (int i = 0; i < iCount; i++) {
char cKey = binRead.ReadChar();
LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
dictSubNodes.Add(cKey, ltrSub);
}
}
else
dictSubNodes = null;
this.ltnParentNode = ltnParentNode;
iSimilarity = binRead.ReadInt();
sCondition = binRead.ReadString();
bWholeWord = binRead.ReadBool();
lrBestRule = elExamples.Rules[binRead.ReadString()];
int iCountBest = binRead.ReadInt();
aBestRules = new RuleWeighted[iCountBest];
for (int i = 0; i < iCountBest; i++)
aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
dWeight = binRead.ReadDouble();
iStart = binRead.ReadInt();
iEnd = binRead.ReadInt();
this.elExamples = elExamples;
}
public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
Load(binRead, lsett, elExamples, ltnParentNode);
}
#endif
#endregion
#region Other (Temporary)
//TODO - this is temp function, remove it
public bool CheckConsistency()
{
var bReturn = true;
if (dictSubNodes != null)
foreach (var ltnChild in dictSubNodes.Values)
bReturn = bReturn &&
ltnChild.CheckConsistency() &&
ltnChild.sCondition.EndsWith(sCondition);
return bReturn;
}
#endregion
}
}

@ -0,0 +1,465 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Runtime.Serialization;
using System.IO.Compression;
using SevenZip;
namespace LemmaSharp
{
[Serializable]
public class Lemmatizer : ITrainableLemmatizer
#if LATINO
, Latino.ISerializable
#endif
{
#region Private Variables
protected LemmatizerSettings lsett;
protected ExampleList elExamples;
protected LemmaTreeNode ltnRootNode;
protected LemmaTreeNode ltnRootNodeFront;
#endregion
#region Constructor(s)
public Lemmatizer() :
this(new LemmatizerSettings())
{ }
public Lemmatizer(LemmatizerSettings lsett)
{
this.lsett = lsett;
this.elExamples = new ExampleList(lsett);
this.ltnRootNode = null;
this.ltnRootNodeFront = null;
}
public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett) : this(lsett)
{
AddMultextFile(srIn, sFormat);
}
#endregion
#region Private Properties
private LemmaTreeNode ltrRootNodeSafe
{
get
{
if (ltnRootNode == null)
BuildModel();
return ltnRootNode;
}
}
private LemmaTreeNode ltrRootNodeFrontSafe
{
get
{
if (ltnRootNodeFront == null && lsett.bBuildFrontLemmatizer)
BuildModel();
return ltnRootNodeFront;
}
}
#endregion
#region Public Properties
public LemmatizerSettings Settings
{
get
{
return lsett.CloneDeep();
}
}
public ExampleList Examples
{
get
{
return elExamples;
}
}
public RuleList Rules
{
get
{
return elExamples.Rules;
}
}
public LemmaTreeNode RootNode
{
get
{
return ltrRootNodeSafe;
}
}
public LemmaTreeNode RootNodeFront
{
get
{
return ltrRootNodeFrontSafe;
}
}
public ILemmatizerModel Model
{
get
{
return ltrRootNodeSafe;
}
}
#endregion
#region Essential Class Functions (adding examples to repository)
public void AddMultextFile(StreamReader srIn, string sFormat)
{
this.elExamples.AddMultextFile(srIn, sFormat);
ltnRootNode = null;
}
public void AddExample(string sWord, string sLemma)
{
AddExample(sWord, sLemma, 1, null);
}
public void AddExample(string sWord, string sLemma, double dWeight)
{
AddExample(sWord, sLemma, dWeight, null);
}
public void AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
elExamples.AddExample(sWord, sLemma, dWeight, sMsd);
ltnRootNode = null;
}
public void DropExamples()
{
elExamples.DropExamples();
}
public void FinalizeAdditions()
{
elExamples.FinalizeAdditions();
}
#endregion
#region Essential Class Functions (building model & lemmatizing)
public void BuildModel()
{
if (ltnRootNode != null)
return;
if (!lsett.bBuildFrontLemmatizer)
{
//TODO remove: elExamples.FinalizeAdditions();
elExamples.FinalizeAdditions();
ltnRootNode = new LemmaTreeNode(lsett, elExamples);
}
else
{
ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
}
}
public string Lemmatize(string sWord)
{
if (!lsett.bBuildFrontLemmatizer)
{
return ltrRootNodeSafe.Lemmatize(sWord);
}
var sWordFront = LemmaExample.StringReverse(sWord);
var sLemmaFront = ltrRootNodeFrontSafe.Lemmatize(sWordFront);
var sWordRear = LemmaExample.StringReverse(sLemmaFront);
return ltrRootNodeSafe.Lemmatize(sWordRear);
}
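//Minimal usage sketch (illustrative only; the word/lemma pairs are made up):
//  var lemmatizer = new Lemmatizer();
//  lemmatizer.AddExample("walked", "walk");
//  lemmatizer.AddExample("talked", "talk");
//  lemmatizer.BuildModel();
//  string lemma = lemmatizer.Lemmatize("worked"); //expected "work" under the learned suffix rule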
#endregion
#region Serialization Functions (ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("lsett", lsett);
info.AddValue("elExamples", elExamples);
}
public Lemmatizer(SerializationInfo info, StreamingContext context) : this()
{
lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
elExamples = (ExampleList)info.GetValue("elExamples", typeof(ExampleList));
this.BuildModel();
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples)
{
lsett.Serialize(binWrt);
binWrt.Write(bSerializeExamples);
elExamples.Serialize(binWrt, bSerializeExamples, false);
if (!bSerializeExamples)
{
elExamples.GetFrontRearExampleList(false).Serialize(binWrt, bSerializeExamples, false);
elExamples.GetFrontRearExampleList(true).Serialize(binWrt, bSerializeExamples, false);
}
ltnRootNode.Serialize(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Serialize(binWrt);
}
public void Deserialize(BinaryReader binRead)
{
lsett = new LemmatizerSettings(binRead);
var bSerializeExamples = binRead.ReadBoolean();
elExamples = new ExampleList(binRead, lsett);
ExampleList elExamplesRear;
ExampleList elExamplesFront;
if (bSerializeExamples)
{
elExamplesRear = elExamples.GetFrontRearExampleList(false);
elExamplesFront = elExamples.GetFrontRearExampleList(true);
}
else
{
elExamplesRear = new ExampleList(binRead, lsett);
elExamplesFront = new ExampleList(binRead, lsett);
}
if (!lsett.bBuildFrontLemmatizer)
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else
{
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
}
}
//Do not change the order! If new compression algorithms are added, append them at the end; otherwise old files can no longer be loaded.
public enum Compression
{
None,
Deflate,
LZMA
}
public Lemmatizer(BinaryReader binRead)
{
var compr = (Compression)binRead.ReadByte();
if (compr == Compression.None)
Deserialize(binRead);
else
throw new Exception("Loading lemmatizer with binary reader on uncompressed stream is not supported.");
}
public Lemmatizer(Stream streamIn)
{
Deserialize(streamIn);
}
public void Serialize(Stream streamOut)
{
Serialize(streamOut, true, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples)
{
Serialize(streamOut, bSerializeExamples, Compression.None);
}
public void Serialize(Stream streamOut, bool bSerializeExamples, Compression compress)
{
streamOut.WriteByte((byte)compress);
switch (compress)
{
case Compression.None:
SerializeNone(streamOut, bSerializeExamples);
break;
case Compression.Deflate:
SerializeDeflate(streamOut, bSerializeExamples);
break;
case Compression.LZMA:
SerializeLZMA(streamOut, bSerializeExamples);
break;
default:
break;
}
}
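//Minimal save/load sketch (illustrative; the file name is made up):
//  using (var fsOut = File.Create("model.lem"))
//      lemmatizer.Serialize(fsOut, true, Compression.LZMA);
//  Lemmatizer restored;
//  using (var fsIn = File.OpenRead("model.lem"))
//      restored = new Lemmatizer(fsIn);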
private void SerializeNone(Stream streamOut, bool bSerializeExamples)
{
using (var binWrt = new BinaryWriter(streamOut))
{
this.Serialize(binWrt, bSerializeExamples);
}
}
private void SerializeDeflate(Stream streamOut, bool bSerializeExamples)
{
using (var streamOutNew = new DeflateStream(streamOut, CompressionMode.Compress, true))
{
using (var binWrt = new BinaryWriter(streamOutNew))
{
this.Serialize(binWrt, bSerializeExamples);
binWrt.Flush();
binWrt.Close();
}
}
}
private void SerializeLZMA(Stream streamOut, bool bSerializeExamples)
{
CoderPropID[] propIDs =
{
CoderPropID.DictionarySize,
CoderPropID.PosStateBits,
CoderPropID.LitContextBits,
CoderPropID.LitPosBits,
CoderPropID.Algorithm,
CoderPropID.NumFastBytes,
CoderPropID.MatchFinder,
CoderPropID.EndMarker
};
Int32 dictionary = 1 << 23;
Int32 posStateBits = 2;
Int32 litContextBits = 3; // for normal files
Int32 litPosBits = 0;
Int32 algorithm = 2;
Int32 numFastBytes = 128;
var mf = "bt4";
var eos = false;
object[] properties =
{
(Int32)(dictionary),
(Int32)(posStateBits),
(Int32)(litContextBits),
(Int32)(litPosBits),
(Int32)(algorithm),
(Int32)(numFastBytes),
mf,
eos
};
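//(The values above are common LZMA encoder settings: an 8 MB dictionary (1 << 23), the "bt4"
//match finder and no end-of-stream marker; the 8-byte loop below writes the uncompressed size
//after the coder properties, matching the layout read back by DecompressLZMA.)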
using (var msTemp = new MemoryStream())
{
using (var binWrtTemp = new BinaryWriter(msTemp))
{
this.Serialize(binWrtTemp, bSerializeExamples);
msTemp.Position = 0;
var encoder = new SevenZip.Compression.LZMA.Encoder();
encoder.SetCoderProperties(propIDs, properties);
encoder.WriteCoderProperties(streamOut);
var fileSize = msTemp.Length;
for (int i = 0; i < 8; i++)
{
streamOut.WriteByte((Byte)(fileSize >> (8 * i)));
}
encoder.Code(msTemp, streamOut, -1, -1, null);
binWrtTemp.Close();
encoder = null;
}
msTemp.Close();
}
}
public void Deserialize(Stream streamIn)
{
var compr = (Compression)streamIn.ReadByte();
using (var streamInNew = Decompress(streamIn, compr))
{
using (var br = new BinaryReader(streamInNew))
{
Deserialize(br);
}
}
}
private Stream Decompress(Stream streamIn, Compression compress)
{
Stream streamInNew;
switch (compress)
{
case Compression.None:
default:
streamInNew = streamIn;
break;
case Compression.Deflate:
streamInNew = new DeflateStream(streamIn, CompressionMode.Decompress);
break;
case Compression.LZMA:
streamInNew = DecompressLZMA(streamIn);
break;
}
return streamInNew;
}
private Stream DecompressLZMA(Stream streamIn)
{
var properties = new byte[5];
if (streamIn.Read(properties, 0, 5) != 5)
throw new Exception("input .lzma is too short");
var decoder = new SevenZip.Compression.LZMA.Decoder();
decoder.SetDecoderProperties(properties);
long outSize = 0;
for (var i = 0; i < 8; i++)
{
var v = streamIn.ReadByte();
if (v < 0)
throw (new Exception("Can't Read 1"));
outSize |= ((long)(byte)v) << (8 * i);
}
var compressedSize = streamIn.Length - streamIn.Position;
var outStream = new MemoryStream();
decoder.Code(streamIn, outStream, compressedSize, outSize, null);
outStream.Seek(0, 0);
decoder = null;
return outStream;
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
lsett.Save(binWrt);
elExamples.Save(binWrt, true, false);
ltnRootNode.Save(binWrt);
if (lsett.bBuildFrontLemmatizer)
ltnRootNodeFront.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead) {
lsett = new LemmatizerSettings(binRead);
elExamples = new ExampleList(binRead, lsett);
if (!lsett.bBuildFrontLemmatizer) {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
else {
ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false) , null);
ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
}
}
public Lemmatizer(Latino.BinarySerializer binRead) {
Load(binRead);
}
public void Save(Stream streamOut) {
Latino.BinarySerializer binWrt = new Latino.BinarySerializer(streamOut);
this.Save(binWrt);
binWrt.Close();
}
public void Load(Stream streamIn) {
Latino.BinarySerializer binRead = new Latino.BinarySerializer(streamIn);
Load(binRead);
binRead.Close();
}
public Lemmatizer(Stream streamIn, string sDummy) {
Load(streamIn);
}
#endif
#endregion
}
}

@ -0,0 +1,143 @@
using System;
using System.IO;
using System.Runtime.Serialization;
namespace LemmaSharp
{
/// <summary>
/// These are the LemmaGen algorithm settings that affect the speed and power of the learning and lemmatizing algorithms.
/// TODO this class will probably be removed in the future.
/// </summary>
[Serializable]
public class LemmatizerSettings : ISerializable
{
#region Constructor(s)
public LemmatizerSettings()
{
}
#endregion
#region Sub-Structures
/// <summary>
/// How the algorithm considers MSD tags.
/// </summary>
public enum MsdConsideration
{
/// <summary>
/// Completely ignores MSD tags (joins examples with different tags and sums their weights).
/// </summary>
Ignore,
/// <summary>
/// Examples with the same word and lemma but different MSDs are kept distinct (not joined).
/// </summary>
Distinct,
/// <summary>
/// Joins examples with different tags (concatenates all MSD tags).
/// </summary>
JoinAll,
/// <summary>
/// Joins examples with different tags (concatenates only distinct MSD tags - somewhat slower).
/// </summary>
JoinDistinct,
/// <summary>
/// Joins examples with different tags (the new tag is the common left-to-right prefix shared by all joined examples).
/// </summary>
JoinSameSubstring
}
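//Illustrative example (hypothetical MSD tags): joining an example tagged "Ncmsn" with one tagged
//"Ncmpn" yields "Ncmsn|Ncmpn" under JoinAll/JoinDistinct, collapses to the shared prefix "Ncm"
//under JoinSameSubstring, and the two examples stay separate under Distinct.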
#endregion
#region Public Variables
/// <summary>
/// True if the 'from' string should be included in the rule identifier ([from]==>[to]); false if only the length of the 'from' string is used ([#len]==>[to]).
/// </summary>
public bool bUseFromInRules = true;
/// <summary>
/// Specifies how the algorithm considers MSD tags.
/// </summary>
public MsdConsideration eMsdConsider = MsdConsideration.Distinct;
/// <summary>
/// How many of the best rules are kept in memory for each node. Zero means unlimited.
/// </summary>
public int iMaxRulesPerNode = 0;
/// <summary>
/// If true, the build process uses a few more heuristics to first build a left-to-right lemmatizer (which lemmatizes the front of the word).
/// </summary>
public bool bBuildFrontLemmatizer = false;
#endregion
#region Cloneable functions
public LemmatizerSettings CloneDeep()
{
return new LemmatizerSettings()
{
bUseFromInRules = this.bUseFromInRules,
eMsdConsider = this.eMsdConsider,
iMaxRulesPerNode = this.iMaxRulesPerNode,
bBuildFrontLemmatizer = this.bBuildFrontLemmatizer
};
}
#endregion
#region Serialization Functions (ISerializable)
public void GetObjectData(SerializationInfo info, StreamingContext context)
{
info.AddValue("bUseFromInRules", bUseFromInRules);
info.AddValue("eMsdConsider", eMsdConsider);
info.AddValue("iMaxRulesPerNode", iMaxRulesPerNode);
info.AddValue("bBuildFrontLemmatizer", bBuildFrontLemmatizer);
}
public LemmatizerSettings(SerializationInfo info, StreamingContext context)
{
bUseFromInRules = info.GetBoolean("bUseFromInRules");
eMsdConsider = (MsdConsideration)info.GetValue("eMsdConsider", typeof(MsdConsideration));
iMaxRulesPerNode = info.GetInt32("iMaxRulesPerNode");
bBuildFrontLemmatizer = info.GetBoolean("bBuildFrontLemmatizer");
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt)
{
binWrt.Write(bUseFromInRules);
binWrt.Write((int)eMsdConsider);
binWrt.Write(iMaxRulesPerNode);
binWrt.Write(bBuildFrontLemmatizer);
}
public void Deserialize(BinaryReader binRead)
{
bUseFromInRules = binRead.ReadBoolean();
eMsdConsider = (MsdConsideration)binRead.ReadInt32();
iMaxRulesPerNode = binRead.ReadInt32();
bBuildFrontLemmatizer = binRead.ReadBoolean();
}
public LemmatizerSettings(System.IO.BinaryReader binRead)
{
this.Deserialize(binRead);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(bUseFromInRules);
binWrt.WriteInt((int)eMsdConsider);
binWrt.WriteInt(iMaxRulesPerNode);
binWrt.WriteBool(bBuildFrontLemmatizer);
}
public void Load(Latino.BinarySerializer binRead) {
bUseFromInRules = binRead.ReadBool();
eMsdConsider = (MsdConsideration)binRead.ReadInt();
iMaxRulesPerNode = binRead.ReadInt();
bBuildFrontLemmatizer = binRead.ReadBool();
}
public LemmatizerSettings(Latino.BinarySerializer reader) {
Load(reader);
}
#endif
#endregion
}
}

@ -0,0 +1,161 @@
using System.Collections.Generic;
using System.IO;
namespace LemmaSharp
{
public class RuleList : Dictionary<string, LemmaRule>
{
#region Private Variables
private LemmatizerSettings lsett;
private LemmaRule lrDefaultRule;
#endregion
#region Constructor(s)
public RuleList(LemmatizerSettings lsett)
{
this.lsett = lsett;
lrDefaultRule = AddRule(new LemmaRule("", "", 0, lsett));
}
#endregion
#region Public Properties
public LemmaRule DefaultRule
{
get
{
return lrDefaultRule;
}
}
#endregion
#region Essential Class Functions
public LemmaRule AddRule(LemmaExample le)
{
return AddRule(new LemmaRule(le.Word, le.Lemma, this.Count, lsett));
}
private LemmaRule AddRule(LemmaRule lrRuleNew)
{
LemmaRule lrRuleReturn = null;
if (!this.TryGetValue(lrRuleNew.Signature, out lrRuleReturn))
{
lrRuleReturn = lrRuleNew;
this.Add(lrRuleReturn.Signature, lrRuleReturn);
}
return lrRuleReturn;
}
#endregion
#region Serialization Functions (Binary)
public void Serialize(BinaryWriter binWrt, bool bThisTopObject)
{
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Serialize(binWrt);
//save list items ---------------------------------------
var iCount = this.Count;
binWrt.Write(iCount);
foreach (var kvp in this)
{
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt, false);
}
//default rule is already saved in the list. Here just save its id.
binWrt.Write(lrDefaultRule.Signature);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett)
{
//load metadata
var bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt32();
for (var iId = 0; iId < iCount; iId++)
{
var sKey = binRead.ReadString();
var lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link the default rule; only its id was saved above.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
this.Deserialize(binRead, lsett);
}
#endregion
#region Serialization Functions (Latino)
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
//save list items ---------------------------------------
int iCount = this.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this) {
binWrt.WriteString(kvp.Key);
kvp.Value.Save(binWrt, false);
}
//default rule is already saved in the list. Here just save its id.
binWrt.WriteString(lrDefaultRule.Signature);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
string sKey = binRead.ReadString();
LemmaRule lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//link the default rule; only its id was saved above.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
#endregion
}
}

@ -0,0 +1,50 @@
using System;
namespace LemmaSharp
{
[Serializable]
class RuleWeighted : IComparable<RuleWeighted>
{
#region Private Variables
private LemmaRule lrRule;
private double dWeight;
#endregion
#region Constructor(s)
public RuleWeighted(LemmaRule lrRule, double dWeight)
{
this.lrRule = lrRule;
this.dWeight = dWeight;
}
#endregion
#region Public Properties
public LemmaRule Rule
{
get { return lrRule; }
}
public double Weight
{
get { return dWeight; }
}
#endregion
#region Essential Class Functions (comparing objects, eg.: for sorting)
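//Note: this ordering sorts rules by descending weight; ties are broken by rule id, with higher ids first.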
public int CompareTo(RuleWeighted rl)
{
if (this.dWeight < rl.dWeight) return 1;
if (this.dWeight > rl.dWeight) return -1;
if (this.lrRule.Id < rl.lrRule.Id) return 1;
if (this.lrRule.Id > rl.lrRule.Id) return -1;
return 0;
}
#endregion
#region Output & Serialization Functions
public override string ToString()
{
return string.Format("{0}{1:(0.00%)}", lrRule, dWeight);
}
#endregion
}
}

@ -0,0 +1,9 @@
using System.Runtime.Serialization;
namespace LemmaSharp
{
public interface ILemmatizer : ISerializable
{
string Lemmatize(string sWord);
}
}

@ -0,0 +1,8 @@
namespace LemmaSharp
{
public interface ILemmatizerModel
{
string Lemmatize(string sWord);
string ToString();
}
}

@ -0,0 +1,12 @@
namespace LemmaSharp
{
public interface ITrainableLemmatizer : ILemmatizer
{
ExampleList Examples { get; }
ILemmatizerModel Model { get; }
void AddExample(string sWord, string sLemma);
void AddExample(string sWord, string sLemma, double dWeight);
void AddExample(string sWord, string sLemma, double dWeight, string sMsd);
void BuildModel();
}
}
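
ITrainableLemmatizer describes the training workflow: feed word/lemma pairs through AddExample, call BuildModel, then lemmatize through the inherited ILemmatizer. A minimal sketch, under the assumption that the Lemmatizer class added in this commit implements the interface and exposes a parameterless constructor:

    using System;
    using LemmaSharp;

    class TrainingDemo
    {
        static void Main()
        {
            // hypothetical usage; the Lemmatizer constructor is an assumption
            ITrainableLemmatizer lemmatizer = new Lemmatizer();
            lemmatizer.AddExample("walked", "walk");
            lemmatizer.AddExample("walking", "walk", 2.0); // weighted example
            lemmatizer.BuildModel();
            // expected to print something like "talk" once the model is built
            Console.WriteLine(lemmatizer.Lemmatize("talked"));
        }
    }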

@ -0,0 +1,539 @@
/*==========================================================================;
*
* (c) 2004-08 JSI. All rights reserved.
*
* File: BinarySerializer.cs
* Version: 1.0
* Desc: Binary serializer
* Author: Miha Grcar
* Created on: Oct-2004
* Last modified: May-2008
* Revision: May-2008
*
***************************************************************************/
//Remark: Use this file as a Latino compatibility checker. When it is included in
// the project it defines the symbol LATINO, which enables all Latino-specific
// serialization functions. When excluded, this code is not compiled and the
// following Latino namespace is not added to the project.
using System;
using System.Runtime.InteropServices;
using System.Collections.Generic;
using System.Reflection;
using System.Text;
using System.IO;
#if LATINO
namespace Latino
{
/* .-----------------------------------------------------------------------
|
| Class BinarySerializer
|
'-----------------------------------------------------------------------
*/
public interface ISerializable {
        // *** Note: a class implementing Latino.ISerializable must also provide a constructor that loads the instance from the serializer
void Save(Latino.BinarySerializer writer);
}
public class BinarySerializer
{
private static Dictionary<string, string> m_full_to_short_type_name
= new Dictionary<string, string>();
private static Dictionary<string, string> m_short_to_full_type_name
= new Dictionary<string, string>();
private Stream m_stream;
private string m_data_dir
= ".";
private static void RegisterTypeName(string full_type_name, string short_type_name)
{
m_full_to_short_type_name.Add(full_type_name, short_type_name);
m_short_to_full_type_name.Add(short_type_name, full_type_name);
}
private static string GetFullTypeName(string short_type_name)
{
return m_short_to_full_type_name.ContainsKey(short_type_name) ? m_short_to_full_type_name[short_type_name] : short_type_name;
}
private static string GetShortTypeName(string full_type_name)
{
return m_full_to_short_type_name.ContainsKey(full_type_name) ? m_full_to_short_type_name[full_type_name] : full_type_name;
}
static BinarySerializer()
{
RegisterTypeName(typeof(bool).AssemblyQualifiedName, "b");
RegisterTypeName(typeof(byte).AssemblyQualifiedName, "ui1");
RegisterTypeName(typeof(sbyte).AssemblyQualifiedName, "i1");
RegisterTypeName(typeof(char).AssemblyQualifiedName, "c");
RegisterTypeName(typeof(double).AssemblyQualifiedName, "f8");
RegisterTypeName(typeof(float).AssemblyQualifiedName, "f4");
RegisterTypeName(typeof(int).AssemblyQualifiedName, "i4");
RegisterTypeName(typeof(uint).AssemblyQualifiedName, "ui4");
RegisterTypeName(typeof(long).AssemblyQualifiedName, "i8");
RegisterTypeName(typeof(ulong).AssemblyQualifiedName, "ui8");
RegisterTypeName(typeof(short).AssemblyQualifiedName, "i2");
RegisterTypeName(typeof(ushort).AssemblyQualifiedName, "ui2");
RegisterTypeName(typeof(string).AssemblyQualifiedName, "s");
}
public BinarySerializer(Stream stream)
{
//Utils.ThrowException(stream == null ? new ArgumentNullException("stream") : null);
m_stream = stream;
}
public BinarySerializer()
{
m_stream = new MemoryStream();
}
public BinarySerializer(string file_name, FileMode file_mode)
{
m_stream = new FileStream(file_name, file_mode); // throws ArgumentException, NotSupportedException, ArgumentNullException, SecurityException, FileNotFoundException, IOException, DirectoryNotFoundException, PathTooLongException, ArgumentOutOfRangeException
}
// *** Reading ***
private byte[] Read<T>() // Read<T>() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int sz = Marshal.SizeOf(typeof(T));
byte[] buffer = new byte[sz];
int num_bytes = m_stream.Read(buffer, 0, sz); // throws IOException, NotSupportedException, ObjectDisposedException
//Utils.ThrowException(num_bytes < sz ? new EndOfStreamException() : null);
return buffer;
}
public bool ReadBool()
{
return ReadByte() != 0;
}
public byte ReadByte() // ReadByte() is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
int val = m_stream.ReadByte(); // throws NotSupportedException, ObjectDisposedException
//Utils.ThrowException(val < 0 ? new EndOfStreamException() : null);
return (byte)val;
}
public sbyte ReadSByte()
{
return (sbyte)ReadByte();
}
private char ReadChar8()
{
return (char)ReadByte();
}
private char ReadChar16()
{
return BitConverter.ToChar(Read<ushort>(), 0);
}
public char ReadChar()
{
return ReadChar16();
}
public double ReadDouble()
{
return BitConverter.ToDouble(Read<double>(), 0);
}
public float ReadFloat()
{
return BitConverter.ToSingle(Read<float>(), 0);
}
public int ReadInt()
{
return BitConverter.ToInt32(Read<int>(), 0);
}
public uint ReadUInt()
{
return BitConverter.ToUInt32(Read<uint>(), 0);
}
public long ReadLong()
{
return BitConverter.ToInt64(Read<long>(), 0);
}
public ulong ReadULong()
{
return BitConverter.ToUInt64(Read<ulong>(), 0);
}
public short ReadShort()
{
return BitConverter.ToInt16(Read<short>(), 0);
}
public ushort ReadUShort()
{
return BitConverter.ToUInt16(Read<ushort>(), 0);
}
private string ReadString8()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len];
m_stream.Read(buffer, 0, len); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.ASCII.GetString(buffer);
}
private string ReadString16()
{
int len = ReadInt();
if (len < 0) { return null; }
byte[] buffer = new byte[len * 2];
m_stream.Read(buffer, 0, len * 2); // throws IOException, NotSupportedException, ObjectDisposedException
return Encoding.Unicode.GetString(buffer);
}
public string ReadString()
{
return ReadString16(); // throws exceptions (see ReadString16())
}
public Type ReadType()
{
string type_name = ReadString8(); // throws exceptions (see ReadString8())
//Utils.ThrowException(type_name == null ? new InvalidDataException() : null);
return Type.GetType(GetFullTypeName(type_name)); // throws TargetInvocationException, ArgumentException, TypeLoadException, FileNotFoundException, FileLoadException, BadImageFormatException
}
public ValueType ReadValue(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsValueType ? new InvalidArgumentValueException("type") : null);
if (type == typeof(bool))
{
return ReadBool();
}
else if (type == typeof(byte))
{
return ReadByte();
}
else if (type == typeof(sbyte))
{
return ReadSByte();
}
else if (type == typeof(char))
{
return ReadChar();
}
else if (type == typeof(double))
{
return ReadDouble();
}
else if (type == typeof(float))
{
return ReadFloat();
}
else if (type == typeof(int))
{
return ReadInt();
}
else if (type == typeof(uint))
{
return ReadUInt();
}
else if (type == typeof(long))
{
return ReadLong();
}
else if (type == typeof(ulong))
{
return ReadULong();
}
else if (type == typeof(short))
{
return ReadShort();
}
else if (type == typeof(ushort))
{
return ReadUShort();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return (ValueType)cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else
{
//throw new ArgumentNotSupportedException("type");
throw new Exception("type");
}
}
public T ReadValue<T>()
{
return (T)(object)ReadValue(typeof(T)); // throws exceptions (see ReadValue(Type type))
}
public object ReadObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
switch (ReadByte())
{
case 0:
return null;
case 1:
break;
case 2:
Type type_0 = ReadType(); // throws exceptions (see ReadType())
//Utils.ThrowException(type_0 == null ? new TypeLoadException() : null);
//Utils.ThrowException(!type.IsAssignableFrom(type_0) ? new InvalidArgumentValueException("type") : null);
type = type_0;
break;
default:
throw new InvalidDataException();
}
if (type == typeof(string))
{
return ReadString();
}
else if (typeof(Latino.ISerializable).IsAssignableFrom(type))
{
ConstructorInfo cxtor = type.GetConstructor(new Type[] { typeof(Latino.BinarySerializer) });
//Utils.ThrowException(cxtor == null ? new ArgumentNotSupportedException("type") : null);
return cxtor.Invoke(new object[] { this }); // throws MemberAccessException, MethodAccessException, TargetInvocationException, NotSupportedException, SecurityException
}
else if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
//throw new InvalidArgumentValueException("type");
throw new Exception("type");
}
}
public T ReadObject<T>()
{
return (T)ReadObject(typeof(T)); // throws exceptions (see ReadObject(Type type))
}
public object ReadValueOrObject(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
if (type.IsValueType)
{
return ReadValue(type); // throws exceptions (see ReadValue(Type type))
}
else
{
return ReadObject(type); // throws exceptions (see ReadObject(Type type))
}
}
public T ReadValueOrObject<T>()
{
return (T)ReadValueOrObject(typeof(T)); // throws exceptions (see ReadValueOrObject(Type type))
}
// *** Writing ***
private void Write(byte[] data) // Write(byte[] data) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.Write(data, 0, data.Length); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteBool(bool val)
{
WriteByte(val ? (byte)1 : (byte)0);
}
public void WriteByte(byte val) // WriteByte(byte val) is directly or indirectly called from several methods thus exceptions thrown here can also be thrown in all those methods
{
m_stream.WriteByte(val); // throws IOException, NotSupportedException, ObjectDisposedException
}
public void WriteSByte(sbyte val)
{
WriteByte((byte)val);
}
private void WriteChar8(char val)
{
WriteByte(Encoding.ASCII.GetBytes(new char[] { val })[0]);
}
private void WriteChar16(char val)
{
Write(BitConverter.GetBytes((ushort)val));
}
public void WriteChar(char val)
{
WriteChar16(val);
}
public void WriteDouble(double val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteFloat(float val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteInt(int val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUInt(uint val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteLong(long val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteULong(ulong val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteShort(short val)
{
Write(BitConverter.GetBytes(val));
}
public void WriteUShort(ushort val)
{
Write(BitConverter.GetBytes(val));
}
private void WriteString8(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.ASCII.GetBytes(val));
}
private void WriteString16(string val)
{
if (val == null) { WriteInt(-1); return; }
WriteInt(val.Length);
Write(Encoding.Unicode.GetBytes(val));
}
public void WriteString(string val)
{
WriteString16(val);
}
public void WriteValue(ValueType val)
{
if (val is bool)
{
WriteBool((bool)val);
}
else if (val is byte)
{
WriteByte((byte)val);
}
else if (val is sbyte)
{
WriteSByte((sbyte)val);
}
else if (val is char)
{
WriteChar((char)val);
}
else if (val is double)
{
WriteDouble((double)val);
}
else if (val is float)
{
WriteFloat((float)val);
}
else if (val is int)
{
WriteInt((int)val);
}
else if (val is uint)
{
WriteUInt((uint)val);
}
else if (val is long)
{
WriteLong((long)val);
}
else if (val is ulong)
{
WriteULong((ulong)val);
}
else if (val is short)
{
WriteShort((short)val);
}
else if (val is ushort)
{
WriteUShort((ushort)val);
}
else if (val is Latino.ISerializable)
{
((Latino.ISerializable)val).Save(this); // throws serialization-related exceptions
}
else
{
//throw new ArgumentTypeException("val");
}
}
public void WriteObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException((obj != null && !type.IsAssignableFrom(obj.GetType())) ? new ArgumentTypeException("obj") : null);
if (obj == null)
{
WriteByte(0);
}
else
{
Type obj_type = obj.GetType();
if (obj_type == type)
{
WriteByte(1);
}
else
{
WriteByte(2);
WriteType(obj_type);
}
if (obj is string)
{
WriteString((string)obj);
}
else if (obj is Latino.ISerializable)
{
((Latino.ISerializable)obj).Save(this); // throws serialization-related exceptions
}
else if (obj is ValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
//throw new ArgumentTypeException("obj");
}
}
}
public void WriteObject<T>(T obj)
{
WriteObject(typeof(T), obj); // throws exceptions (see WriteObject(Type type, object obj))
}
public void WriteValueOrObject(Type type, object obj)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
//Utils.ThrowException(!type.IsAssignableFrom(obj.GetType()) ? new ArgumentTypeException("obj") : null);
if (type.IsValueType)
{
WriteValue((ValueType)obj); // throws exceptions (see WriteValue(ValueType val))
}
else
{
WriteObject(type, obj); // throws exceptions (see WriteObject(Type type, object obj))
}
}
public void WriteValueOrObject<T>(T obj)
{
WriteValueOrObject(typeof(T), obj); // throws exceptions (see WriteValueOrObject(Type type, object obj))
}
public void WriteType(Type type)
{
//Utils.ThrowException(type == null ? new ArgumentNullException("type") : null);
WriteString8(GetShortTypeName(type.AssemblyQualifiedName));
}
// *** Data directory ***
public string DataDir
{
get { return m_data_dir; }
set
{
//Utils.ThrowException(!Utils.VerifyPathName(value, /*must_exist=*/true) ? new InvalidArgumentValueException("DataDir") : null);
m_data_dir = value;
}
}
// *** Access to the associated stream ***
public void Close()
{
m_stream.Close();
}
public void Flush()
{
m_stream.Flush(); // throws IOException
}
public Stream Stream
{
get { return m_stream; }
}
}
}
#endif
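
The note on the ISerializable interface above describes the convention this serializer relies on: every serializable class provides a Save(Latino.BinarySerializer) method and a matching constructor that takes the serializer and reloads the instance. A minimal sketch of that pattern for a hypothetical type (only compiled when LATINO is defined):

    #if LATINO
    // hypothetical example type, not part of the library
    public class PointExample : Latino.ISerializable
    {
        private int x, y;
        public PointExample(int x, int y) { this.x = x; this.y = y; }
        // loading constructor required by the convention noted above
        public PointExample(Latino.BinarySerializer reader)
        {
            x = reader.ReadInt();
            y = reader.ReadInt();
        }
        public void Save(Latino.BinarySerializer writer)
        {
            writer.WriteInt(x);
            writer.WriteInt(y);
        }
    }
    #endif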

@ -0,0 +1,175 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{A39293C1-92D8-47B9-93A4-41F443B4F9E4}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharp</RootNamespace>
<AssemblyName>LemmaSharp</AssemblyName>
<TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<IsWebBootstrapper>true</IsWebBootstrapper>
<StartupObject>
</StartupObject>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<TargetFrameworkProfile />
<PublishUrl>http://localhost/LemmaSharp/</PublishUrl>
<Install>true</Install>
<InstallFrom>Web</InstallFrom>
<UpdateEnabled>true</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|x64'">
<OutputPath>bin\x64\WPFDevelop\</OutputPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|AnyCPU'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|AnyCPU'">
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|AnyCPU'">
<OutputPath>bin\WPFDevelop\</OutputPath>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x86'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x86\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x86</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x86'">
<OutputPath>bin\x86\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x86</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|x86'">
<OutputPath>bin\x86\WPFDevelop\</OutputPath>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="Lzma#, Version=4.12.3884.11200, Culture=neutral, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>ExternalLibs\Lzma#.dll</HintPath>
</Reference>
<Reference Include="System" />
</ItemGroup>
<ItemGroup>
<Compile Include="LatinoCompatibility\BinarySerializer.cs">
<SubType>Code</SubType>
</Compile>
<Compile Include="Interfaces\ILemmatizer.cs" />
<Compile Include="Interfaces\ILemmatizerModel.cs" />
<Compile Include="Interfaces\ILemmatizerTrainable.cs" />
<Compile Include="Classes\LemmatizerSettings.cs" />
<Compile Include="Classes\LemmaRule.cs" />
<Compile Include="Classes\Lemmatizer.cs" />
<Compile Include="Classes\LemmaTreeNode.cs" />
<Compile Include="Classes\LemmaExample.cs" />
<Compile Include="Classes\ExampleList.cs" />
<Compile Include="Classes\RuleList.cs" />
<Compile Include="Classes\RuleWeighted.cs" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Client.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework Client Profile</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.2.0">
<Visible>False</Visible>
<ProductName>.NET Framework 2.0 %28x86%29</ProductName>
<Install>true</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.0">
<Visible>False</Visible>
<ProductName>.NET Framework 3.0 %28x86%29</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5</ProductName>
<Install>false</Install>
</BootstrapperPackage>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>false</Install>
</BootstrapperPackage>
</ItemGroup>
<ItemGroup>
<Folder Include="Properties\" />
</ItemGroup>
<ItemGroup>
<None Include="app.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

@ -0,0 +1,3 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7.2"/></startup></configuration>

@ -0,0 +1,28 @@
namespace LemmaSharp
{
public enum LanguagePrebuilt
{
//from Multext-East v4 lexicons
Bulgarian,
Czech,
English,
Estonian,
Persian,
French,
Hungarian,
Macedonian,
Polish,
Romanian,
Russian,
Slovak,
Slovene,
Serbian,
Ukrainian,
//from Multext lexicons
EnglishMT,
FrenchMT,
German,
Italian,
Spanish,
}
}

@ -0,0 +1,117 @@
using System;
using System.IO;
using System.Reflection;
using System.Runtime.Serialization;
namespace LemmaSharp
{
[Serializable]
public abstract class LemmatizerPrebuilt : Lemmatizer
{
#region Private Variables
private static string[] asLangMapping = new string[] {
"bg", "mlteast",
"cs", "mlteast",
"en", "mlteast",
"et", "mlteast",
"fa", "mlteast",
"fr", "mlteast",
"hu", "mlteast",
"mk", "mlteast",
"pl", "mlteast",
"ro", "mlteast",
"ru", "mlteast",
"sk", "mlteast",
"sl", "mlteast",
"sr", "mlteast",
"uk", "mlteast",
"en", "multext",
"fr", "multext",
"ge", "multext",
"it", "multext",
"sp", "multext",
};
private LanguagePrebuilt lang;
#endregion
#region Constructor(s)
public LemmatizerPrebuilt(LanguagePrebuilt lang)
: base()
{
this.lang = lang;
}
public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
: base(lsett)
{
this.lang = lang;
}
#endregion
#region Private Properties Helping Functions
protected string GetResourceFileName(string sFileMask)
{
return GetResourceFileName(sFileMask, lang);
}
public static string GetResourceFileName(string sFileMask, LanguagePrebuilt lang)
{
var langFileName = string.Format("{0}-{1}", asLangMapping[(int)lang * 2 + 1], asLangMapping[(int)lang * 2]);
return string.Format(sFileMask, langFileName);
}
#endregion
#region Public Properties
public LanguagePrebuilt Language
{
get
{
return lang;
}
}
public LexiconPrebuilt Lexicon
{
get
{
return GetLexicon(lang);
}
}
#endregion
#region Public Properties
public static LexiconPrebuilt GetLexicon(LanguagePrebuilt lang)
{
return (LexiconPrebuilt)Enum.Parse(typeof(LexiconPrebuilt), asLangMapping[((int)lang) * 2 + 1], true);
}
#endregion
#region Resource Management Functions
protected abstract Assembly GetExecutingAssembly();
protected Stream GetResourceStream(string sResourceShortName)
{
var assembly = GetExecutingAssembly();
string sResourceName = null;
foreach (string sResource in assembly.GetManifestResourceNames())
{
if (sResource.EndsWith(sResourceShortName))
{
sResourceName = sResource;
break;
}
}
if (string.IsNullOrEmpty(sResourceName))
return null;
return assembly.GetManifestResourceStream(sResourceName);
}
#endregion
#region Serialization Functions
public LemmatizerPrebuilt(SerializationInfo info, StreamingContext context)
: base(info, context)
{
}
#endregion
}
}
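
GetResourceFileName combines the lexicon name and language code from asLangMapping into the supplied file mask, so for LanguagePrebuilt.English the pair ("mlteast", "en") yields "mlteast-en". A worked sketch, using the "full7z-{0}.lem" mask that LemmatizerPrebuiltFull in the next file defines:

    // worked example of the resource-name construction above
    string name = LemmatizerPrebuilt.GetResourceFileName("full7z-{0}.lem", LanguagePrebuilt.English);
    System.Console.WriteLine(name); // full7z-mlteast-en.lem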

@ -0,0 +1,30 @@
using System;
using System.Reflection;
namespace LemmaSharp
{
[Serializable]
public class LemmatizerPrebuiltFull : LemmatizerPrebuilt
{
public const string FILEMASK = "full7z-{0}.lem";
#region Constructor(s)
public LemmatizerPrebuiltFull(LanguagePrebuilt lang)
: base(lang)
{
using (var stream = GetResourceStream(GetResourceFileName(FILEMASK)))
{
this.Deserialize(stream);
stream.Close();
}
}
#endregion
#region Resource Management Functions
protected override Assembly GetExecutingAssembly()
{
return Assembly.GetExecutingAssembly();
}
#endregion
}
}
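
A minimal usage sketch for the prebuilt lemmatizer, assuming the matching embedded data resource (e.g. full7z-mlteast-en.lem) ships with the assembly and that Lemmatize is inherited from the base Lemmatizer class:

    using System;
    using LemmaSharp;

    class PrebuiltDemo
    {
        static void Main()
        {
            var lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.English);
            // expected to print the lemma, e.g. "walk", if the English model loads
            Console.WriteLine(lemmatizer.Lemmatize("walking"));
        }
    }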

@ -0,0 +1,8 @@
namespace LemmaSharp
{
public enum LexiconPrebuilt
{
MltEast,
Multext
}
}

@ -0,0 +1,140 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>9.0.21022</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{1E700D21-62D3-4525-93FE-C1FB0A1B0564}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaSharp</RootNamespace>
<AssemblyName>LemmaSharpPrebuilt</AssemblyName>
<TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<FileUpgradeFlags>
</FileUpgradeFlags>
<UpgradeBackupLocation>
</UpgradeBackupLocation>
<OldToolsVersion>3.5</OldToolsVersion>
<TargetFrameworkProfile />
<PublishUrl>publish\</PublishUrl>
<Install>true</Install>
<InstallFrom>Disk</InstallFrom>
<UpdateEnabled>false</UpdateEnabled>
<UpdateMode>Foreground</UpdateMode>
<UpdateInterval>7</UpdateInterval>
<UpdateIntervalUnits>Days</UpdateIntervalUnits>
<UpdatePeriodically>false</UpdatePeriodically>
<UpdateRequired>false</UpdateRequired>
<MapFileExtensions>true</MapFileExtensions>
<ApplicationRevision>0</ApplicationRevision>
<ApplicationVersion>1.0.0.%2a</ApplicationVersion>
<IsWebBootstrapper>false</IsWebBootstrapper>
<UseApplicationTrust>false</UseApplicationTrust>
<BootstrapperEnabled>true</BootstrapperEnabled>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<CodeAnalysisUseTypeNameInSuppression>true</CodeAnalysisUseTypeNameInSuppression>
<CodeAnalysisModuleSuppressionsFile>GlobalSuppressions.cs</CodeAnalysisModuleSuppressionsFile>
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|x64'">
<OutputPath>bin\x64\WPFDevelop\</OutputPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|AnyCPU'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|AnyCPU'">
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>AnyCPU</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|AnyCPU'">
<OutputPath>bin\WPFDevelop\</OutputPath>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x86'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x86\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x86</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x86'">
<OutputPath>bin\x86\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x86</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'WPFDevelop|x86'">
<OutputPath>bin\x86\WPFDevelop\</OutputPath>
<PlatformTarget>x86</PlatformTarget>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Compile Include="Classes\LanguagePrebuilt.cs" />
<Compile Include="Classes\LemmatizerPrebuilt.cs" />
<Compile Include="Classes\LexiconPrebuilt.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\LemmaSharp\LemmaSharp.csproj">
<Project>{A39293C1-92D8-47B9-93A4-41F443B4F9E4}</Project>
<Name>LemmaSharp</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<Folder Include="Data\" />
<Folder Include="Properties\" />
</ItemGroup>
<ItemGroup>
<BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
<Visible>False</Visible>
<ProductName>.NET Framework 3.5 SP1</ProductName>
<Install>true</Install>
</BootstrapperPackage>
</ItemGroup>
<ItemGroup>
<None Include="app.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>
