Monday, May 9, 2011

Boyer-Moore Horspool return all occurrences in one go

First I would like to say that I'm sorry for not posting for quite some time now, but thanks to Simon H. who found a bug in original algorithm found here, I've managed to also extend the function to return all occurrences of a pattern in a string, without further introduction here's the code!
type
  TFSResults = array of Integer;

function FindStringMulti(const Value, Pattern: string;
  const CaseSensitive: Boolean = True;
  const StartPos: Integer = 1): TFSResults;
var
  Index: Integer;
  jIndex: Integer;
  kIndex: Integer;
  LLenPattern: Integer;
  LLenValue: Integer;
  LSkipTable: array[Char] of Integer;
  LChar: Char;

    function __SameChar: Boolean;
    begin
      if CaseSensitive then
        Result := (Value[Index] = Pattern[jIndex])
      else
        Result := (CompareText(Value[Index], Pattern[jIndex]) = 0);
    end; // function __SameChar: Boolean;

begin
  LLenPattern := Length(Pattern);
  if LLenPattern = 0 then
    Exit;
  for LChar := Low(Char) to High(Char) do
    LSkipTable[LChar] := LLenPattern;
  if CaseSensitive then begin
    for kIndex := 1 to LLenPattern -1 do
      LSkipTable[Pattern[kIndex]] := LLenPattern -kIndex;
  end else begin
    for kIndex := 1 to LLenPattern -1 do
      LSkipTable[Windows.CharLower(@Pattern[kIndex])^] := LLenPattern -kIndex;
  end; // if CaseSensitive then begin
  kIndex := LLenPattern + (StartPos -1);
  LLenValue := Length(Value);
  while (kIndex <= LLenValue) do begin
    Index := kIndex;
    jIndex := LLenPattern;
    while (jIndex >= 1) do begin
      if __SameChar then begin
        jIndex := jIndex -1;
        Index := Index -1;
      end else
        jIndex := -1;
      if jIndex = 0 then begin
        SetLength(Result, Length(Result) +1);
        Result[High(Result)] := Index +1;
        jIndex := -1;
      end; // if jIndex = 0 then begin
      kIndex := kIndex + LSkipTable[Value[kIndex]];
    end; // while (jIndex >= 1) do begin
  end; // while (kIndex <= LLenValue) do begin
end;

Enjoy!

5 comments:

  1. on delphi xe2 update 3 in this line LSkipTable[Windows.CharLower(@Pattern[kIndex])^] := LLenPattern -kIndex; have a problem with CharLower i test on windows 7 access violation at address 76B461B4 in modulo KERNELBASE.dll. Write of address 0073AB6C I DELETE WINDOWS. TO FUNCTION ON XE2 ONLY WRITE Charlower and have this error

    ReplyDelete
  2. @Anonymous I've just tested it on Windows 7 with latests updates, Delphi 2010 it works perfectly, it could be that there's something wrong in Delphi XE2...

    I do not, and will not own a license for Delphi XE 2, maybe next version...

    One way to solve it, might be to define a local variable which will hold the value:
    var
    LSkipChar: Char;
    begin
    ...
    for kIndex := 1 to LLenPattern -1 do
    LSkipChar := Windows.CharLower(@Pattern[kIndex])^;
    LSkipTable[LSkipChar] := LLenPattern -kIndex;
    ...

    Please let me know if this fixes the issue.

    ReplyDelete
  3. my tests have shown that
    kIndex := kIndex + LSkipTable[Value[kIndex]];
    should be
    kIndex := kIndex + 1;
    otherwhise the number of occurences is arbitrary, depending on the positions of the Pattern within the Value

    ReplyDelete
    Replies
    1. Hey, it would be helpful if you'd post your test cases so I can look into it, I have little time for the blog lately...

      Delete
    2. Hey,
      I dropped the next few lines in a TMemo and searched its text for "Pattern"
      The search missed the first occurence, in line
      " if LLenPattern = LLenSource then // strings are equal"
      The dubugger showed that kIndex went to the space right after "LLenPattern"
      Hope it helps...
      (ps. all lines are indented with two spaces. If not, it works fine.)

      function CompliesWithWholeWordsRule(Index: Integer): Boolean;
      const
      WholeWordsRule = ['a'..'z', 'A'..'Z', '_', '0'..'9'];
      begin
      if WholeWords then
      begin
      if LLenPattern = LLenSource then // strings are equal
      Result := True
      else if Index = 1 then // pattern found at the beginning
      // char after pattern length must not be a letter
      Result := not (SourceString[Index + LLenPattern] in WholeWordsRule)
      else if Index + LLenPattern - 1 = LLenSource then // pattern found at the end
      // char before index must not be a letter
      Result := not (SourceString[Index - 1] in WholeWordsRule)
      else // pattern found in the middle
      // chars before index and after pattern length must not be letters
      Result := not ( (SourceString[Index + LLenPattern] in WholeWordsRule)
      or (SourceString[Index - 1] in WholeWordsRule));
      end {if}
      else // no WholeWords rule
      Result := True;
      end; {CompliesWithWholeWordsRule}

      Delete

Blogroll(General programming and Delphi feeds)