using System;
using System.Collections.Generic;

namespace ZeroLevel.Services.Semantic.Helpers
{
    /// <summary>
    /// Substring search based on the Rabin-Karp rolling hash.
    /// Every hash hit is confirmed with an ordinal character comparison, so hash
    /// collisions can never produce a false match.
    /// </summary>
    public static class RabinKarp
    {
        // Radix of the polynomial hash. It is only a multiplier: characters above 255
        // are still hashed correctly because every step is reduced modulo ModuloValue.
        private const ulong AlphabetSize = 256;

        // Prime modulus of the hash. All intermediate products stay far below 2^64.
        private const ulong ModuloValue = 65537;

        /// <summary>
        /// Searches for the first occurrence of a pattern in a target using Rabin-Karp's algorithm.
        /// </summary>
        /// <param name="target">The <see cref="string"/> to search in.</param>
        /// <param name="pattern">The <see cref="string"/> to search for.</param>
        /// <returns>
        /// The position of the first occurrence of the pattern, or -1 if it does not occur.
        /// An empty pattern is found at position 0.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="target"/> or <paramref name="pattern"/> is null.
        /// </exception>
        public static int RabinKarpSearchFirst(string target, string pattern)
        {
            if (target == null) throw new ArgumentNullException(nameof(target));
            if (pattern == null) throw new ArgumentNullException(nameof(pattern));

            // The occurrence sequence is lazy, so scanning stops at the first hit.
            foreach (int position in FindOccurrences(target, pattern))
            {
                return position;
            }

            return -1;
        }

        /// <summary>
        /// Searches for all occurrences of a pattern in a target using Rabin-Karp's algorithm.
        /// </summary>
        /// <param name="target">The <see cref="string"/> to search in.</param>
        /// <param name="pattern">The <see cref="string"/> to search for.</param>
        /// <returns>
        /// The positions at which the pattern occurs, in ascending order (overlapping
        /// occurrences included). Empty if there is none. An empty pattern occurs at
        /// every position from 0 to <c>target.Length</c> inclusive.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="target"/> or <paramref name="pattern"/> is null.
        /// </exception>
        public static IList<int> RabinKarpSearchAll(string target, string pattern)
        {
            if (target == null) throw new ArgumentNullException(nameof(target));
            if (pattern == null) throw new ArgumentNullException(nameof(pattern));

            return new List<int>(FindOccurrences(target, pattern));
        }

        /// <summary>
        /// Searches for the first occurrence of multiple patterns in a target using Rabin-Karp's algorithm.
        /// </summary>
        /// <param name="target">The <see cref="string"/> to search in.</param>
        /// <param name="patterns">The patterns to search for. Duplicates are searched once.</param>
        /// <returns>
        /// A dictionary mapping each found pattern to the position of its first occurrence.
        /// A pattern that is not found (including one longer than the target) has no entry.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="target"/> or <paramref name="patterns"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
        public static Dictionary<string, int> RabinKarpMultipleSearchFirst(string target, IList<string> patterns)
        {
            if (target == null) throw new ArgumentNullException(nameof(target));
            if (patterns == null) throw new ArgumentNullException(nameof(patterns));

            var matches = new Dictionary<string, int>();

            foreach (KeyValuePair<int, Dictionary<ulong, List<string>>> group in GroupPatterns(target, patterns))
            {
                // Number of patterns of this length that still have no match;
                // lets the scan for this length stop as soon as it reaches zero.
                int unresolved = 0;
                foreach (List<string> bucket in group.Value.Values)
                {
                    unresolved += bucket.Count;
                }

                foreach (KeyValuePair<int, ulong> window in EnumerateWindows(target, group.Key))
                {
                    List<string> candidates;
                    if (!group.Value.TryGetValue(window.Value, out candidates))
                    {
                        continue;
                    }

                    foreach (string candidate in candidates)
                    {
                        // Equal hashes are only a hint; confirm with a real comparison.
                        if (matches.ContainsKey(candidate) || !EqualsAt(target, window.Key, candidate))
                        {
                            continue;
                        }

                        matches.Add(candidate, window.Key);
                        unresolved--;
                    }

                    if (unresolved == 0)
                    {
                        break;
                    }
                }
            }

            return matches;
        }

        /// <summary>
        /// Searches for all occurrences of multiple patterns in a target using Rabin-Karp's algorithm.
        /// </summary>
        /// <param name="target">The <see cref="string"/> to search in.</param>
        /// <param name="patterns">The patterns to search for. Duplicates are searched once.</param>
        /// <returns>
        /// A dictionary mapping each found pattern to the ascending list of positions at
        /// which it occurs. A pattern that is not found (including one longer than the
        /// target) has no entry.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="target"/> or <paramref name="patterns"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException"><paramref name="patterns"/> contains a null entry.</exception>
        public static Dictionary<string, List<int>> RabinKarpMultipleSearchAll(string target, IList<string> patterns)
        {
            if (target == null) throw new ArgumentNullException(nameof(target));
            if (patterns == null) throw new ArgumentNullException(nameof(patterns));

            var matches = new Dictionary<string, List<int>>();

            foreach (KeyValuePair<int, Dictionary<ulong, List<string>>> group in GroupPatterns(target, patterns))
            {
                foreach (KeyValuePair<int, ulong> window in EnumerateWindows(target, group.Key))
                {
                    List<string> candidates;
                    if (!group.Value.TryGetValue(window.Value, out candidates))
                    {
                        continue;
                    }

                    foreach (string candidate in candidates)
                    {
                        if (!EqualsAt(target, window.Key, candidate))
                        {
                            continue;
                        }

                        // Entries are created on the first hit only, so patterns that
                        // never match simply never get a key.
                        List<int> positions;
                        if (!matches.TryGetValue(candidate, out positions))
                        {
                            positions = new List<int>();
                            matches.Add(candidate, positions);
                        }

                        positions.Add(window.Key);
                    }
                }
            }

            return matches;
        }

        /// <summary>Lazily yields, in ascending order, every position at which <paramref name="pattern"/> occurs in <paramref name="target"/>.</summary>
        private static IEnumerable<int> FindOccurrences(string target, string pattern)
        {
            ulong patternHash = HashPrefix(pattern, pattern.Length);

            foreach (KeyValuePair<int, ulong> window in EnumerateWindows(target, pattern.Length))
            {
                if (window.Value == patternHash && EqualsAt(target, window.Key, pattern))
                {
                    yield return window.Key;
                }
            }
        }

        /// <summary>
        /// Yields (start index, hash) for every window of <paramref name="length"/> characters
        /// in <paramref name="target"/>, left to right. Yields nothing when the window is longer
        /// than the target. For length 0 every position 0..target.Length is yielded with hash 0.
        /// </summary>
        private static IEnumerable<KeyValuePair<int, ulong>> EnumerateWindows(string target, int length)
        {
            int lastStart = target.Length - length;
            if (lastStart < 0)
            {
                yield break;
            }

            ulong hash = HashPrefix(target, length);

            // Weight of the window's leading character: AlphabetSize^(length - 1) mod ModuloValue.
            ulong leadingWeight = 1;
            for (int i = 1; i < length; i++)
            {
                leadingWeight = (leadingWeight * AlphabetSize) % ModuloValue;
            }

            for (int start = 0; start <= lastStart; start++)
            {
                // Slide the window one character to the right. An empty window has
                // nothing to slide and keeps the hash of the empty string (0).
                if (start > 0 && length > 0)
                {
                    ulong outgoing = (leadingWeight * target[start - 1]) % ModuloValue;
                    // Adding ModuloValue first keeps the unsigned subtraction from wrapping.
                    hash = (hash + ModuloValue - outgoing) % ModuloValue;
                    hash = (hash * AlphabetSize + target[start + length - 1]) % ModuloValue;
                }

                yield return new KeyValuePair<int, ulong>(start, hash);
            }
        }

        /// <summary>Computes the polynomial hash of the first <paramref name="length"/> characters of <paramref name="text"/>.</summary>
        private static ulong HashPrefix(string text, int length)
        {
            ulong hash = 0;
            for (int i = 0; i < length; i++)
            {
                hash = (hash * AlphabetSize + text[i]) % ModuloValue;
            }

            return hash;
        }

        /// <summary>Ordinal check that <paramref name="pattern"/> occurs in <paramref name="target"/> at <paramref name="index"/>, without allocating a substring.</summary>
        private static bool EqualsAt(string target, int index, string pattern)
        {
            return string.CompareOrdinal(target, index, pattern, 0, pattern.Length) == 0;
        }

        /// <summary>
        /// Groups the distinct patterns by length and then by hash. Patterns longer than
        /// the target are dropped because they can never match.
        /// </summary>
        private static Dictionary<int, Dictionary<ulong, List<string>>> GroupPatterns(string target, IList<string> patterns)
        {
            var groups = new Dictionary<int, Dictionary<ulong, List<string>>>();
            var seen = new HashSet<string>(StringComparer.Ordinal);

            for (int i = 0; i < patterns.Count; i++)
            {
                string pattern = patterns[i];
                if (pattern == null)
                {
                    throw new ArgumentException("The pattern list must not contain null entries.", nameof(patterns));
                }

                if (pattern.Length > target.Length || !seen.Add(pattern))
                {
                    continue;
                }

                Dictionary<ulong, List<string>> byHash;
                if (!groups.TryGetValue(pattern.Length, out byHash))
                {
                    byHash = new Dictionary<ulong, List<string>>();
                    groups.Add(pattern.Length, byHash);
                }

                ulong hash = HashPrefix(pattern, pattern.Length);
                List<string> bucket;
                if (!byHash.TryGetValue(hash, out bucket))
                {
                    bucket = new List<string>();
                    byHash.Add(hash, bucket);
                }

                bucket.Add(pattern);
            }

            return groups;
        }
    }
}