mirror of https://github.com/ogoun/Zero.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
346 lines
16 KiB
346 lines
16 KiB
using System;
|
|
using System.Collections.Generic;
|
|
|
|
namespace ZeroLevel.Services.Semantic.Helpers
|
|
{
|
|
/// <summary>
/// Substring search based on the Rabin-Karp rolling hash. Every hash hit is confirmed with an
/// ordinal character comparison, so hash collisions can never produce a false match.
/// </summary>
public static class RabinKarp
{
    /// <summary>Radix of the polynomial rolling hash.</summary>
    private const ulong AlphabetSize = 256;

    /// <summary>
    /// Prime modulus of the rolling hash. It is small enough that no intermediate product
    /// computed in this class can overflow a <see cref="ulong"/>.
    /// </summary>
    private const ulong ModuloValue = 65537;

    /// <summary>
    /// Searches for the first occurrence of a pattern in a target <see cref="string"/> using the Rabin-Karp algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for.</param>
    /// <returns>The zero-based position of the first occurrence, or -1 when the pattern does not occur.
    /// An empty pattern is found at position 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="pattern"/> is null.</exception>
    public static int RabinKarpSearchFirst(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        int patternLength = pattern.Length;
        if (target.Length < patternLength) return -1;

        // The empty pattern trivially occurs at the very beginning.
        if (patternLength == 0) return 0;

        ulong patternHash = HashPrefix(pattern, patternLength);
        ulong windowHash = HashPrefix(target, patternLength);
        ulong weight = HighOrderWeight(patternLength);
        int lastStart = target.Length - patternLength;

        for (int start = 0; start <= lastStart; start++)
        {
            // Slide the window one character to the right (the first window is already hashed).
            if (start > 0)
            {
                windowHash = Roll(windowHash, weight, target[start - 1], target[start + patternLength - 1]);
            }

            // Equal hashes only make a match likely; confirm it character by character.
            if (windowHash == patternHash && IsMatchAt(target, start, pattern))
            {
                return start;
            }
        }

        return -1;
    }

    /// <summary>
    /// Searches for all occurrences (overlapping ones included) of a pattern in a target <see cref="string"/>
    /// using the Rabin-Karp algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="pattern">The <see cref="string"/> to search for.</param>
    /// <returns>An <see cref="IList{T}"/> with the zero-based positions of all occurrences in ascending order.
    /// The list is empty when the pattern does not occur. An empty pattern occurs at every position
    /// from 0 to the length of the target, inclusive.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="pattern"/> is null.</exception>
    public static IList<int> RabinKarpSearchAll(string target, string pattern)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (pattern == null) throw new ArgumentNullException(nameof(pattern));

        int patternLength = pattern.Length;
        if (target.Length < patternLength) return new List<int>();

        // A zero-length window has nothing to roll; report every position deterministically
        // instead of feeding the rolling hash a window it is not defined for.
        if (patternLength == 0) return AllPositions(target.Length);

        var matches = new List<int>();
        ulong patternHash = HashPrefix(pattern, patternLength);
        ulong windowHash = HashPrefix(target, patternLength);
        ulong weight = HighOrderWeight(patternLength);
        int lastStart = target.Length - patternLength;

        for (int start = 0; start <= lastStart; start++)
        {
            if (start > 0)
            {
                windowHash = Roll(windowHash, weight, target[start - 1], target[start + patternLength - 1]);
            }

            if (windowHash == patternHash && IsMatchAt(target, start, pattern))
            {
                matches.Add(start);
            }
        }

        return matches;
    }

    /// <summary>
    /// Searches for the first occurrence of each of several patterns in a target <see cref="string"/>
    /// using the Rabin-Karp algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">The patterns to search for. Duplicates are searched once;
    /// patterns longer than the target are treated as not found.</param>
    /// <returns>A <see cref="Dictionary{TKey, TValue}"/> mapping every pattern that occurs to the
    /// zero-based position of its first occurrence. Patterns that do not occur have no entry.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
    public static Dictionary<string, int> RabinKarpMultipleSearchFirst(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var patternHashes = new Dictionary<string, ulong>();
        Dictionary<int, List<string>> groups = GroupSearchablePatterns(target, patterns, patternHashes);
        var matches = new Dictionary<string, int>();

        // Patterns of equal length share one rolling window over the target.
        foreach (KeyValuePair<int, List<string>> group in groups)
        {
            int patternLength = group.Key;
            List<string> candidates = group.Value;
            int unresolved = candidates.Count;

            ulong windowHash = HashPrefix(target, patternLength);
            ulong weight = HighOrderWeight(patternLength);
            int lastStart = target.Length - patternLength;

            // Stop scanning as soon as every pattern of this length has been located.
            // (An empty pattern is resolved at position 0, so a zero-length window is never rolled.)
            for (int start = 0; start <= lastStart && unresolved > 0; start++)
            {
                if (start > 0)
                {
                    windowHash = Roll(windowHash, weight, target[start - 1], target[start + patternLength - 1]);
                }

                foreach (string candidate in candidates)
                {
                    // Only the first occurrence is of interest.
                    if (matches.ContainsKey(candidate)) continue;

                    if (windowHash == patternHashes[candidate] && IsMatchAt(target, start, candidate))
                    {
                        matches.Add(candidate, start);
                        unresolved--;
                    }
                }
            }
        }

        return matches;
    }

    /// <summary>
    /// Searches for all occurrences (overlapping ones included) of each of several patterns in a
    /// target <see cref="string"/> using the Rabin-Karp algorithm.
    /// </summary>
    /// <param name="target">The <see cref="string"/> to search in.</param>
    /// <param name="patterns">The patterns to search for. Duplicates are searched once;
    /// patterns longer than the target are treated as not found.</param>
    /// <returns>A <see cref="Dictionary{TKey, TValue}"/> mapping every pattern that occurs to a
    /// <see cref="List{T}"/> of its zero-based positions in ascending order. Patterns that do not occur
    /// have no entry. An empty pattern occurs at every position from 0 to the length of the target, inclusive.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="patterns"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
    public static Dictionary<string, List<int>> RabinKarpMultipleSearchAll(string target, IList<string> patterns)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (patterns == null) throw new ArgumentNullException(nameof(patterns));

        var patternHashes = new Dictionary<string, ulong>();
        Dictionary<int, List<string>> groups = GroupSearchablePatterns(target, patterns, patternHashes);
        var matches = new Dictionary<string, List<int>>();

        // Patterns of equal length share one rolling window over the target.
        foreach (KeyValuePair<int, List<string>> group in groups)
        {
            int patternLength = group.Key;
            List<string> candidates = group.Value;

            if (patternLength == 0)
            {
                // The only zero-length pattern is the empty string; it occurs everywhere
                // and a zero-length window cannot be rolled.
                matches.Add(candidates[0], AllPositions(target.Length));
                continue;
            }

            ulong windowHash = HashPrefix(target, patternLength);
            ulong weight = HighOrderWeight(patternLength);
            int lastStart = target.Length - patternLength;

            for (int start = 0; start <= lastStart; start++)
            {
                if (start > 0)
                {
                    windowHash = Roll(windowHash, weight, target[start - 1], target[start + patternLength - 1]);
                }

                foreach (string candidate in candidates)
                {
                    if (windowHash != patternHashes[candidate] || !IsMatchAt(target, start, candidate))
                    {
                        continue;
                    }

                    // Entries are created lazily so that unmatched patterns never appear in the result.
                    List<int> positions;
                    if (!matches.TryGetValue(candidate, out positions))
                    {
                        positions = new List<int>();
                        matches.Add(candidate, positions);
                    }
                    positions.Add(start);
                }
            }
        }

        return matches;
    }

    /// <summary>
    /// Validates the pattern list, hashes every distinct pattern that can fit into the target and
    /// groups those patterns by length.
    /// </summary>
    /// <param name="target">The string that will be searched.</param>
    /// <param name="patterns">The caller-supplied pattern list.</param>
    /// <param name="patternHashes">Receives the hash of every distinct searchable pattern.</param>
    /// <returns>The searchable patterns keyed by their length.</returns>
    private static Dictionary<int, List<string>> GroupSearchablePatterns(
        string target, IList<string> patterns, Dictionary<string, ulong> patternHashes)
    {
        var groups = new Dictionary<int, List<string>>();

        for (int i = 0; i < patterns.Count; i++)
        {
            string pattern = patterns[i];
            if (pattern == null)
            {
                throw new ArgumentException("The pattern list must not contain null entries.", nameof(patterns));
            }

            // A pattern longer than the target cannot occur, and a repeated pattern needs no second search.
            if (pattern.Length > target.Length || patternHashes.ContainsKey(pattern))
            {
                continue;
            }

            patternHashes.Add(pattern, HashPrefix(pattern, pattern.Length));

            List<string> group;
            if (!groups.TryGetValue(pattern.Length, out group))
            {
                group = new List<string>();
                groups.Add(pattern.Length, group);
            }
            group.Add(pattern);
        }

        return groups;
    }

    /// <summary>Computes the polynomial hash of the first <paramref name="length"/> characters of <paramref name="text"/>.</summary>
    private static ulong HashPrefix(string text, int length)
    {
        ulong hash = 0;
        for (int i = 0; i < length; i++)
        {
            hash = (hash * AlphabetSize + text[i]) % ModuloValue;
        }
        return hash;
    }

    /// <summary>
    /// Computes <c>AlphabetSize^(length - 1) mod ModuloValue</c>, the weight carried by the leading
    /// character of a window of the given length. Returns 1 for lengths of 0 and 1.
    /// </summary>
    private static ulong HighOrderWeight(int length)
    {
        ulong weight = 1;
        for (int i = 1; i < length; i++)
        {
            weight = (weight * AlphabetSize) % ModuloValue;
        }
        return weight;
    }

    /// <summary>
    /// Advances a window hash by one character: removes the contribution of the character that
    /// leaves the window and appends the character that enters it.
    /// </summary>
    private static ulong Roll(ulong hash, ulong weight, char outgoing, char incoming)
    {
        // ModuloValue is added before subtracting so the unsigned difference cannot wrap around.
        hash = (hash + ModuloValue - weight * outgoing % ModuloValue) % ModuloValue;
        return (hash * AlphabetSize + incoming) % ModuloValue;
    }

    /// <summary>
    /// Ordinal check that <paramref name="pattern"/> occurs in <paramref name="target"/> at
    /// <paramref name="start"/>, performed without allocating a substring.
    /// </summary>
    private static bool IsMatchAt(string target, int start, string pattern)
    {
        return string.CompareOrdinal(target, start, pattern, 0, pattern.Length) == 0;
    }

    /// <summary>Builds the list 0, 1, ..., <paramref name="targetLength"/> (the positions of the empty pattern).</summary>
    private static List<int> AllPositions(int targetLength)
    {
        var positions = new List<int>(targetLength + 1);
        for (int position = 0; position <= targetLength; position++)
        {
            positions.Add(position);
        }
        return positions;
    }
}
|
|
}
|