mirror of https://github.com/ogoun/Zero.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
156 lines
5.5 KiB
156 lines
5.5 KiB
namespace ZeroLevel.Services.Semantic.Helpers
|
|
{
|
|
public static class TextDistance
|
|
{
|
|
private static int MinOf3(int a, int b, int c)
|
|
{
|
|
if (a < b)
|
|
{
|
|
if (b < c) return a;
|
|
if (c < a) return c;
|
|
else return a;
|
|
}
|
|
if (c < b) return c;
|
|
else return b;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Computes the Levenshtein distance between two strings.
|
|
/// </summary>
|
|
/// <param name="s1">The first <see cref="string"/>.</param>
|
|
/// <param name="s2">The second <see cref="string"/>.</param>
|
|
/// <returns>The edit distiance between the given <see cref="string"/> objets.</returns>
|
|
public static int LevenshteinDistance(string s1, string s2)
|
|
{
|
|
// Null or empty checks
|
|
if (string.IsNullOrEmpty(s1))
|
|
{
|
|
if (string.IsNullOrEmpty(s2))
|
|
return 0;
|
|
else
|
|
return s2.Length;
|
|
}
|
|
if (string.IsNullOrEmpty(s2)) return s1.Length;
|
|
|
|
// Faster access
|
|
int s1Length = s1.Length;
|
|
int s2Length = s2.Length;
|
|
|
|
// Create two rows for computation. We don't need reconstruction so a full matrix isn't needed
|
|
var rows = new int[2][];
|
|
rows[0] = new int[s2Length + 1];
|
|
rows[1] = new int[s2Length + 1];
|
|
|
|
// Initialize first row
|
|
for (int i = 0; i <= s2Length; i++)
|
|
rows[0][i] = i;
|
|
// Row for computation
|
|
int curRow = 1;
|
|
for (int i = 0; i < s1Length; i++)
|
|
{
|
|
// Calculate first index in current row for computation
|
|
rows[curRow][0] = i + 1;
|
|
int prevRow = curRow ^ 1;
|
|
// Calculate rest of the row
|
|
for (int j = 1; j <= s2Length; j++)
|
|
{
|
|
int cost = s1[i] == s2[j - 1] ? 0 : 1;
|
|
rows[curRow][j] = MinOf3(
|
|
rows[prevRow][j] + 1, // deletion
|
|
rows[curRow][j - 1] + 1, // insertion
|
|
rows[prevRow][j - 1] + cost); // substitution
|
|
}
|
|
// Change row for computation to the next.
|
|
curRow = i & 1;
|
|
}
|
|
return rows[curRow ^ 1][s2Length];
|
|
}
|
|
|
|
/// <summary>
|
|
/// Computes the Damerau-Levenshtein distance between two strings.
|
|
/// </summary>
|
|
/// <param name="s1">The first <see cref="string"/>.</param>
|
|
/// <param name="s2">The second <see cref="string"/>.</param>
|
|
/// <returns>The edit distiance between the given <see cref="string"/> objets.</returns>
|
|
public static int DamerauLevenshteinDistance(string s1, string s2)
|
|
{
|
|
// Null or empty checks
|
|
if (string.IsNullOrEmpty(s1))
|
|
{
|
|
if (string.IsNullOrEmpty(s2))
|
|
return 0;
|
|
else
|
|
return s2.Length;
|
|
}
|
|
if (string.IsNullOrEmpty(s2)) return s1.Length;
|
|
|
|
// Faster access
|
|
int s1Length = s1.Length;
|
|
int s2Length = s2.Length;
|
|
|
|
// Create three rows for computation. We don't need reconstruction so a full matrix isn't needed
|
|
var rows = new int[3][];
|
|
rows[0] = new int[s2Length + 1];
|
|
rows[1] = new int[s2Length + 1];
|
|
rows[2] = new int[s2Length + 1];
|
|
|
|
// Initialize first row
|
|
for (int i = 0; i <= s2Length; i++)
|
|
rows[0][i] = i;
|
|
|
|
// Define rows
|
|
int transRow = -1;
|
|
int prevRow = 0;
|
|
int curRow = 1;
|
|
|
|
for (int i = 1; i <= s1Length; i++)
|
|
{
|
|
// Calculate first index in current row for computation
|
|
rows[curRow][0] = i;
|
|
|
|
// Calculate rest of the row
|
|
for (int j = 1; j <= s2Length; j++)
|
|
{
|
|
int cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
|
|
rows[curRow][j] = MinOf3(
|
|
rows[prevRow][j] + 1, // deletion
|
|
rows[curRow][j - 1] + 1, // insertion
|
|
rows[prevRow][j - 1] + cost); // substitution
|
|
|
|
if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1])
|
|
{
|
|
// Transposition
|
|
int curVal = rows[curRow][j];
|
|
int transVal = rows[transRow][j - 2] + cost;
|
|
rows[curRow][j] = curVal < transVal ? curVal : transVal;
|
|
}
|
|
}
|
|
|
|
// Update rows
|
|
switch (curRow)
|
|
{
|
|
case 0:
|
|
curRow = 1;
|
|
prevRow = 0;
|
|
transRow = 2;
|
|
break;
|
|
case 1:
|
|
curRow = 2;
|
|
prevRow = 1;
|
|
transRow = 0;
|
|
break;
|
|
case 2:
|
|
curRow = 0;
|
|
prevRow = 2;
|
|
transRow = 1;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
return rows[prevRow][s2Length];
|
|
}
|
|
}
|
|
}
|