c++ - 在 C/C++ 中进行不区分大小写的子字符串搜索的最快方法?

标签 c++ c optimization string glibc

注意事项

下面的问题是在 2008 年就 2003 年的一些代码提出的。正如 OP 的更新 所示,整篇文章已被 2008 年的老式算法所淘汰,并且仅作为历史好奇而保留在这里。


我需要在 C/C++ 中进行不区分大小写的快速子字符串搜索。我的要求如下:

  • 应该表现得像 strstr()(即返回一个指向匹配点的指针)。
  • 必须不区分大小写 (doh)。
  • 必须支持当前的语言环境。
  • 必须在 Windows (MSVC++ 8.0) 上可用或可轻松移植到 Windows(即来自开源库)。

这是我正在使用的当前实现(取自 GNU C 库):

/* Return the offset of one string within another.
   Copyright (C) 1994,1996,1997,1998,1999,2000 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/*
 * My personal strstr() implementation that beats most other algorithms.
 * Until someone tells me otherwise, I assume that this is the
 * fastest implementation of strstr() in C.
 * I deliberately chose not to comment it.  You should have at least
 * as much fun trying to understand it, as I had to write it :-).
 *
 * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de */

/*
 * Modified to use table lookup instead of tolower(), since tolower() isn't
 * worth s*** on Windows.
 *
 * -- Anders Sandvig (anders@wincue.org)
 */

#if HAVE_CONFIG_H
# include <config.h>
#endif

#include <ctype.h>
#include <string.h>

typedef unsigned chartype;

char char_table[256];

void init_stristr(void)
{
  int i;
  char string[2];

  string[1] = '\0';
  for (i = 0; i < 256; i++)
  {
    string[0] = i;
    _strlwr(string);
    char_table[i] = string[0];
  }
}

#define my_tolower(a) ((chartype) char_table[a])

char *
my_stristr (phaystack, pneedle)
     const char *phaystack;
     const char *pneedle;
{
  register const unsigned char *haystack, *needle;
  register chartype b, c;

  haystack = (const unsigned char *) phaystack;
  needle = (const unsigned char *) pneedle;

  b = my_tolower (*needle); 
  if (b != '\0')
  {
    haystack--;             /* possible ANSI violation */
    do
      {
        c = *++haystack;
        if (c == '\0')
          goto ret0;
      }
    while (my_tolower (c) != (int) b);

    c = my_tolower (*++needle);
    if (c == '\0')
        goto foundneedle;

    ++needle;
    goto jin;

    for (;;)
    {
      register chartype a;
        register const unsigned char *rhaystack, *rneedle;

        do
        {
          a = *++haystack;
          if (a == '\0')
              goto ret0;
          if (my_tolower (a) == (int) b)
              break;
          a = *++haystack;
          if (a == '\0')
              goto ret0;
        shloop:
          ;
        }
      while (my_tolower (a) != (int) b);

jin:      
      a = *++haystack;
      if (a == '\0')
          goto ret0;

        if (my_tolower (a) != (int) c)
          goto shloop;

        rhaystack = haystack-- + 1;
        rneedle = needle;

        a = my_tolower (*rneedle);

        if (my_tolower (*rhaystack) == (int) a)
          do
          {
              if (a == '\0')
                goto foundneedle;

              ++rhaystack;
          a = my_tolower (*++needle);
              if (my_tolower (*rhaystack) != (int) a)
                break;

          if (a == '\0')
                goto foundneedle;

          ++rhaystack;
              a = my_tolower (*++needle);
          }
          while (my_tolower (*rhaystack) == (int) a);

        needle = rneedle;       /* took the register-poor approach */

      if (a == '\0')
          break;
    }
  }
foundneedle:
  return (char*) haystack;
ret0:
  return 0;
}

您能否使这段代码更快,或者您知道更好的实现方式吗?

注意:我注意到 GNU C 库现在有 a new implementation of strstr() ,但我不确定将其修改为不区分大小写的难易程度,或者它实际上是否比旧版本更快(在我的情况下)。我还注意到 the old implementation is still used for wide character strings ,所以如果有人知道原因,请分享。

更新

为了清楚起见——以防万一——我没有编写这个函数,它是 GNU C 库的一部分。我只是修改它不区分大小写。

另外,感谢您提供有关 strcasetr() 的提示,并检查来自其他来源(如 OpenBSD、FreeBSD 等)的其他实现。这似乎是要走的路。上面的代码来自 2003 年,这就是我将其发布在这里的原因,希望有更好的版本可用,显然是这样。 :)

最佳答案

您发布的代码大约是 strcasetr 的一半。

$ gcc -Wall -o my_stristr my_stristr.c
steve@solaris:~/code/tmp
$ gcc -Wall -o strcasestr strcasestr.c 
steve@solaris:~/code/tmp
$ ./bench ./my_stristr > my_stristr.result ; ./bench ./strcasestr > strcasestr.result;
steve@solaris:~/code/tmp
$ cat my_stristr.result 
run 1... time = 6.32
run 2... time = 6.31
run 3... time = 6.31
run 4... time = 6.31
run 5... time = 6.32
run 6... time = 6.31
run 7... time = 6.31
run 8... time = 6.31
run 9... time = 6.31
run 10... time = 6.31
average user time over 10 runs = 6.3120
steve@solaris:~/code/tmp
$ cat strcasestr.result 
run 1... time = 3.82
run 2... time = 3.82
run 3... time = 3.82
run 4... time = 3.82
run 5... time = 3.82
run 6... time = 3.82
run 7... time = 3.82
run 8... time = 3.82
run 9... time = 3.82
run 10... time = 3.82
average user time over 10 runs = 3.8200
steve@solaris:~/code/tmp

main 函数是:

int main(void)
{
        char * needle="hello";
        char haystack[1024];
        int i;

        for(i=0;i<sizeof(haystack)-strlen(needle)-1;++i)
        {
                haystack[i]='A'+i%57;
        }
        memcpy(haystack+i,needle, strlen(needle)+1);
        /*printf("%s\n%d\n", haystack, haystack[strlen(haystack)]);*/
        init_stristr();

        for (i=0;i<1000000;++i)
        {
                /*my_stristr(haystack, needle);*/
                strcasestr(haystack,needle);
        }


        return 0;
}

它经过适当修改以测试这两种实现。我注意到,当我输入这个时,我留在了 init_stristr 调用中,但它不应该改变太多。 bench 只是一个简单的 shell 脚本:

#!/bin/bash
function bc_calc()
{
        echo $(echo "scale=4;$1" | bc)
}
time="/usr/bin/time -p"
prog="$1"
accum=0
runs=10
for a in $(jot $runs 1 $runs)
do
        echo -n "run $a... "
        t=$($time $prog 2>&1| grep user | awk '{print $2}')
        echo "time = $t"
        accum=$(bc_calc "$accum+$t")
done

echo -n "average user time over $runs runs = "
echo $(bc_calc "$accum/$runs")

关于c++ - 在 C/C++ 中进行不区分大小写的子字符串搜索的最快方法?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/211535/

相关文章:

c - scanf 与 ^ 运算符

c - 试图将阶乘部分变成另一个函数

algorithm - 计算最小值和最大值之间的点的最快方法

c++ - 2D 图形性能提升

c++ - 让指针回到第一行 - QFile

c++ - 编译时检查的字符串到整数映射是否可能?

mysql - 优化查询-mysql

C++ 加减 100 位数字

计算不重叠的连续 1 位

Java - 在持久性媒体中存储整数的快速方法