You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Zero/ZeroLevel/Services/Semantic/Search/RabinKarp.cs

346 lines
16 KiB

using System;
using System.Collections.Generic;
namespace ZeroLevel.Services.Semantic.Helpers
{
public static class RabinKarp
{
/// <summary>
/// Searches for the first occurrence of a pattern in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>Returns the position of the first occurrence of the pattern. If not found returns -1.</returns>
public static int RabinKarpSearchFirst(string target, string pattern)
{
if (target == null!) throw new ArgumentNullException(nameof(target));
if (pattern == null!) throw new ArgumentNullException(nameof(pattern));
// Save for faster access
int patternLength = pattern.Length;
if (target.Length < patternLength) return -1;
ulong targetHash = 0;
ulong patternHash = 0;
ulong alphabetSize = 256; // max char value
ulong moduloValue = 65537; // custom selected prime number for the hashing
// Calculating hash of pattern and the beggining of target
for (int i = 0; i < patternLength; i++)
{
patternHash = (patternHash * alphabetSize + pattern[i]) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i]) % moduloValue;
}
// Check if pattern is in the beginning
if (patternHash == targetHash)
if (string.Equals(target.Substring(0, patternLength), pattern))
return 0;
// Calculate pow value (used in the hashing proccess)
ulong pow = 1;
for (int i = 0; i < patternLength - 1; i++)
{
pow = (pow * alphabetSize) % moduloValue;
}
// Hashing the rest of the target and searching for the pattern
int endOfSearch = target.Length - patternLength;
for (int i = 0; i < endOfSearch; i++)
{
// Some Rabin-Karp magic
targetHash = (targetHash + moduloValue - pow * target[i] % moduloValue) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i + patternLength]) % moduloValue;
// If the hashes are equal check the string( because collisions are possible) and return if found
if (targetHash == patternHash)
if (string.Equals(target.Substring(i + 1, patternLength), pattern))
return i + 1;
}
// The pattern was not found
return -1;
}
/// <summary>
/// Searches for all occurences of a pattern in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="pattern">The <see cref="string"/> to search for.</param>
/// <returns>Returns <see cref="IList{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs. <see cref="IList{T}"/> is empty if none found.</returns>
public static IList<int> RabinKarpSearchAll(string target, string pattern)
{
if (target == null!) throw new ArgumentNullException(nameof(target));
if (pattern == null!) throw new ArgumentNullException(nameof(pattern));
// Save for faster access
int patternLength = pattern.Length;
// List with the positions where the pattern was found
var matches = new List<int>();
if (target.Length < patternLength) return matches;
ulong targetHash = 0;
ulong patternHash = 0;
ulong alphabetSize = 256; // max char value
ulong moduloValue = 65537; // custom selected prime number for the hashing
// Calculating hash of pattern and the beggining of target
for (int i = 0; i < patternLength; i++)
{
patternHash = (patternHash * alphabetSize + pattern[i]) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i]) % moduloValue;
}
// Check if pattern is in the beginning
if (patternHash == targetHash)
if (string.Equals(target.Substring(0, patternLength), pattern))
matches.Add(0);
// Calculate pow value (used in the hashing proccess)
ulong pow = 1;
for (int i = 0; i < patternLength - 1; i++)
{
pow = (pow * alphabetSize) % moduloValue;
}
// Hashing the rest of the target and searching for the pattern
int endOfSearch = target.Length - patternLength;
for (int i = 0; i < endOfSearch; i++)
{
// Some Rabin-Karp magic
targetHash = (targetHash + moduloValue - pow * target[i] % moduloValue) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i + patternLength]) % moduloValue;
// If the hashes are equal check the string( because collisions are possible) and return if found
if (targetHash == patternHash)
if (string.Equals(target.Substring(i + 1, patternLength), pattern))
matches.Add(i + 1);
}
// Retrun the list with all starting positions of the pattern
return matches;
}
/// <summary>
/// Searches for the first occurrence of multiple patterns in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns.</param>
/// <returns>Retruns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="int"/> values of the position of first occurence.
/// If a pattern is not found there is no entry in the dictionary.</returns>
public static Dictionary<string, int> RabinKarpMultipleSearchFirst(string target, IList<string> patterns)
{
if (target == null!) throw new ArgumentNullException(nameof(target));
if (patterns == null!) throw new ArgumentNullException(nameof(patterns));
// Dictionary with pattern hashes for all strings
var patternHashes = new Dictionary<string, ulong>();
// Dictionary with target hashes for all different string lengths
var targetHashes = new Dictionary<int, ulong>();
// Dictionary with pow values for all different string lengths
var pows = new Dictionary<int, ulong>();
// Dictionary with all strings with a specific length
var patternLengths = new Dictionary<int, List<string>>();
// Dictionary with found positions for every string
var matches = new Dictionary<string, int>();
ulong alphabetSize = 256; // max char value
ulong moduloValue = 65537; // custom selected prime number for the hashing
// Calculating hash of patterns and all target hashes and pow values
for (int i = 0; i < patterns.Count; i++)
{
// Chech if target hash for current string length has to be computed
bool hasToComputeTargetHashAndPow = !targetHashes.ContainsKey(patterns[i].Length);
// Populate pattern lengths dictionary
if (hasToComputeTargetHashAndPow) patternLengths.Add(patterns[i].Length, new List<string>() { patterns[i] });
else patternLengths[patterns[i].Length].Add(patterns[i]);
ulong patternHash = 0;
ulong targetHash = 0;
ulong pow = 1;
for (int j = 0; j < patterns[i].Length; j++)
{
patternHash = (patternHash * alphabetSize + patterns[i][j]) % moduloValue;
if (hasToComputeTargetHashAndPow)
{
targetHash = (targetHash * alphabetSize + target[j]) % moduloValue;
if (j != 0) // used to skip one iteration. Pow is calculated with one less iteration
pow = (pow * alphabetSize) % moduloValue;
}
}
// Add hashes in collections
patternHashes.Add(patterns[i], patternHash);
if (hasToComputeTargetHashAndPow)
{
targetHashes.Add(patterns[i].Length, targetHash);
pows.Add(patterns[i].Length, pow);
}
}
// Check if pattern is in the beginning
foreach (var patKVP in patternHashes)
{
if (patKVP.Value == targetHashes[patKVP.Key.Length])
if (string.Equals(target.Substring(0, patKVP.Key.Length), patKVP.Key))
matches.Add(patKVP.Key, 0);
}
// Hashing the rest of the target and searching for the pattern
// Patters are grouped by their length
foreach (var patternsWithSpecificLength in patternLengths)
{
int patternLength = patternsWithSpecificLength.Key;
int endOfSearch = target.Length - patternLength;
for (int i = 0; i < endOfSearch; i++)
{
ulong targetHash = targetHashes[patternLength];
// Some Rabin-Karp magic
targetHash = (targetHash + moduloValue - pows[patternLength] * target[i] % moduloValue) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i + patternLength]) % moduloValue;
targetHashes[patternLength] = targetHash;
// Search all patterns for a match
foreach (var pat in patternsWithSpecificLength.Value)
{
if (!matches.ContainsKey(pat))
{
// If the hashes are equal check the string( because collisions are possible) and return if found
if (targetHash == patternHashes[pat])
if (string.Equals(target.Substring(i + 1, patternLength), pat))
matches.Add(pat, i + 1);
}
if (matches.Count == patterns.Count) return matches;
}
if (matches.Count == patterns.Count) return matches;
}
}
// Return matches
return matches;
}
/// <summary>
/// Searches for all occurrences of multiple patterns in a target <see cref="string"/> using Rabin-Karp's algorithm.
/// </summary>
/// <param name="target">The <see cref="string"/> to search in.</param>
/// <param name="patterns">A <see cref="IList{T}"/> of <see cref="string"/> patterns.</param>
/// <returns>Retruns <see cref="Dictionary{TKey, TValue}"/> with <see cref="string"/> keys of the patterns and <see cref="List{T}"/> of <see cref="int"/> values of the positions at which the pattern occurs.
/// If a pattern is not found there is no entry in the dictionary.</returns>
public static Dictionary<string, List<int>> RabinKarpMultipleSearchAll(string target, IList<string> patterns)
{
if (target == null!) throw new ArgumentNullException(nameof(target));
if (patterns == null!) throw new ArgumentNullException(nameof(patterns));
// Dictionary with pattern hashes for all strings
var patternHashes = new Dictionary<string, ulong>();
// Dictionary with target hashes for all different string lengths
var targetHashes = new Dictionary<int, ulong>();
// Dictionary with pow values for all different string lengths
var pows = new Dictionary<int, ulong>();
// Dictionary with all strings with a specific length
var patternLengths = new Dictionary<int, List<string>>();
// Dictionary with found positions for every string
var matches = new Dictionary<string, List<int>>();
ulong alphabetSize = 256; // max char value
ulong moduloValue = 65537; // custom selected prime number for the hashing
// Calculating hash of patterns and all target hashes and pow values
for (int i = 0; i < patterns.Count; i++)
{
// Chech if target hash for current string length has to be computed
bool hasToComputeTargetHashAndPow = !targetHashes.ContainsKey(patterns[i].Length);
// Populate matches dictionary and pattern lengths dictionary
matches.Add(patterns[i], new List<int>());
if (hasToComputeTargetHashAndPow) patternLengths.Add(patterns[i].Length, new List<string>() { patterns[i] });
else patternLengths[patterns[i].Length].Add(patterns[i]);
ulong patternHash = 0;
ulong targetHash = 0;
ulong pow = 1;
for (int j = 0; j < patterns[i].Length; j++)
{
patternHash = (patternHash * alphabetSize + patterns[i][j]) % moduloValue;
if (hasToComputeTargetHashAndPow)
{
targetHash = (targetHash * alphabetSize + target[j]) % moduloValue;
if (j != 0) // used to skip one iteration. Pow is calculated with one less iteration
pow = (pow * alphabetSize) % moduloValue;
}
}
// Add hashes in collections
patternHashes.Add(patterns[i], patternHash);
if (hasToComputeTargetHashAndPow)
{
targetHashes.Add(patterns[i].Length, targetHash);
pows.Add(patterns[i].Length, pow);
}
}
// Check if pattern is in the beginning
foreach (var patKVP in patternHashes)
{
if (patKVP.Value == targetHashes[patKVP.Key.Length])
if (string.Equals(target.Substring(0, patKVP.Key.Length), patKVP.Key))
matches[patKVP.Key].Add(0);
}
// Hashing the rest of the target and searching for the pattern
// Patters are grouped by their length
foreach (var patternsWithSpecificLength in patternLengths)
{
int patternLength = patternsWithSpecificLength.Key;
int endOfSearch = target.Length - patternLength;
for (int i = 0; i < endOfSearch; i++)
{
ulong targetHash = targetHashes[patternLength];
// Some Rabin-Karp magic
targetHash = (targetHash + moduloValue - pows[patternLength] * target[i] % moduloValue) % moduloValue;
targetHash = (targetHash * alphabetSize + target[i + patternLength]) % moduloValue;
targetHashes[patternLength] = targetHash;
// Search all patterns for a match
foreach (var pat in patternsWithSpecificLength.Value)
{
// If the hashes are equal check the string( because collisions are possible) and return if found
if (targetHash == patternHashes[pat])
if (string.Equals(target.Substring(i + 1, patternLength), pat))
matches[pat].Add(i + 1);
}
}
}
// Remove all patterns that are not found
for (int i = 0; i < patterns.Count; i++)
{
if (matches[patterns[i]].Count == 0)
{
matches.Remove(patterns[i]);
}
}
// Return matches
return matches;
}
}
}

Powered by TurnKey Linux.