C++ 不使用正则表达式的多个字符串匹配

标签 c++ regex string

我正在尝试编辑现有的 C++ 程序来搜索文本中的多个字符串而不是单个字符串。我是一个完全的 C++ 菜鸟,所以我希望有人能给我指出一个可以工作并且快速的函数。我尝试使用正则表达式 ( see my earlier post ),这有效,但是正则表达式对于我的目的来说太慢了。获取以下代码(由 @WiktorStribiżew 提供):

#include <iostream>
#include <regex>
using namespace std;

int main() {
    std::string pattern("A|D");         // Regex expression
    std::regex rx(pattern);             // Getting the regex object 

    std::string s("ABCDEABCABD");       // Defining the string input
    std::ptrdiff_t number_of_matches = std::distance(  // Count the number of matches inside the iterator
        std::sregex_iterator(s.begin(), s.end(), rx),
        std::sregex_iterator());

    std::cout << number_of_matches << std::endl;  // Displaying results
    return 0;
}

这会输出想要的答案 (5),但对于我的目的来说太慢了(搜索 ~100+ GB 文件中的每一行)。我怎样才能让它更快?输入主题字符串 s 是从文件中读取的,查询字符串 pattern 在命令行上提供,因此它作为字符串出现(通过管道分隔为 A|D ,例如)。

最佳答案

正则表达式和性能相互咬合。
我重写了您的示例以不使用 Regex,并编译了两个版本(在 OS X 上,使用 g++ 5.3.0 和 -O3 -fwhole-program ):

#include <iostream>
using namespace std;

#define NUM_PATTERNS 2

int main() {
    std::string patterns[] = {std::string("A"), std::string("D")};
    std::string str("ABCDEABCABD");
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(int pos = 0; ; )
        {
            pos = str.find(patterns[i], pos);
            if(pos == std::string::npos)
            {
                break;
            }
            ++pos;      // in order to not match again
            ++count;
        }
    }

    std::cout << count << std::endl;  // Displaying results
    return 0;
}

生成的程序集有 545 行:

    .cstring
    .align 3
LC0:
    .ascii "basic_string::_M_construct null not valid\0"
    .section __TEXT,__text_cold,regular,pure_instructions
    .align 1
LCOLDB1:
    .text
LHOTB1:
    .align 1,0x90
    .align 4,0x90
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18:
LFB1645:
    pushq   %r13
LCFI0:
    pushq   %r12
LCFI1:
    leaq    16(%rdi), %r12
    pushq   %rbp
LCFI2:
    pushq   %rbx
LCFI3:
    subq    $24, %rsp
LCFI4:
    testq   %rsi, %rsi
    movq    %r12, (%rdi)
    je  L2
    movq    %rdi, %rbx
    movq    %rsi, %rdi
    movq    %rsi, %r13
    call    _strlen
    cmpq    $15, %rax
    movq    %rax, %rbp
    movq    %rax, 8(%rsp)
    ja  L13
    cmpq    $1, %rax
    je  L14
    testq   %rax, %rax
    jne L15
L6:
    movq    8(%rsp), %rax
    movq    (%rbx), %rdx
    movq    %rax, 8(%rbx)
    movb    $0, (%rdx,%rax)
    addq    $24, %rsp
LCFI5:
    popq    %rbx
LCFI6:
    popq    %rbp
LCFI7:
    popq    %r12
LCFI8:
    popq    %r13
LCFI9:
    ret
L13:
LCFI10:
    leaq    8(%rsp), %rsi
    xorl    %edx, %edx
    movq    %rbx, %rdi
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm
    movq    8(%rsp), %rdx
    movq    %rax, (%rbx)
    movq    %rax, %rdi
    movq    %rdx, 16(%rbx)
L4:
    movq    %rbp, %rdx
    movq    %r13, %rsi
    call    _memcpy
    jmp L6
L14:
    movzbl  0(%r13), %eax
    movb    %al, 16(%rbx)
    jmp L6
L2:
    leaq    LC0(%rip), %rdi
    call    __ZSt19__throw_logic_errorPKc
L15:
    movq    %r12, %rdi
    jmp L4
LFE1645:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
    .text
LHOTE1:
    .cstring
LC2:
    .ascii "A\0"
LC3:
    .ascii "D\0"
LC4:
    .ascii "ABCDEABCABD\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB5:
    .align 4
    .globl _main
_main:
LFB1425:
    pushq   %r15
LCFI11:
    leaq    LC2(%rip), %rsi
    pushq   %r14
LCFI12:
    pushq   %r13
LCFI13:
    pushq   %r12
LCFI14:
    pushq   %rbp
LCFI15:
    pushq   %rbx
LCFI16:
    subq    $104, %rsp
LCFI17:
    leaq    32(%rsp), %rbp
    movq    %rbp, %rdi
LEHB0:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE0:
    leaq    32(%rbp), %rdi
    leaq    LC3(%rip), %rsi
LEHB1:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE1:
    leaq    LC4(%rip), %rsi
    movq    %rsp, %rdi
    movq    %rsp, %r12
LEHB2:
    call    __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcRKS3_.isra.18
LEHE2:
    leaq    8(%rbp), %r14
    xorl    %r15d, %r15d
    xorl    %ebx, %ebx
L22:
    leaq    (%r15,%r14), %r13
    xorl    %eax, %eax
    jmp L21
    .align 4
L44:
    addl    $1, %eax
    addl    $1, %ebx
L21:
    movq    0(%rbp,%r15), %rsi
    movslq  %eax, %rdx
    movq    %r12, %rdi
    movq    0(%r13), %rcx
    call    __ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE4findEPKcmm
    cmpl    $-1, %eax
    jne L44
    addq    $32, %r15
    cmpq    $64, %r15
    jne L22
    movq    __ZSt4cout@GOTPCREL(%rip), %rdi
    movl    %ebx, %esi
LEHB3:
    call    __ZNSo9_M_insertImEERSoT_
    movq    %rax, %rdi
    call    __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
LEHE3:
    movq    (%rsp), %rdi
    addq    $16, %r12
    cmpq    %r12, %rdi
    je  L23
    call    __ZdlPv
L23:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L24
    call    __ZdlPv
L24:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L39
    call    __ZdlPv
L39:
    addq    $104, %rsp
LCFI18:
    xorl    %eax, %eax
    popq    %rbx
LCFI19:
    popq    %rbp
LCFI20:
    popq    %r12
LCFI21:
    popq    %r13
LCFI22:
    popq    %r14
LCFI23:
    popq    %r15
LCFI24:
    ret
L38:
LCFI25:
    movq    %rax, %r12
    movl    $1, %eax
L19:
    movl    $1, %ebx
    subq    %rax, %rbx
    salq    $5, %rbx
    addq    %rbp, %rbx
L29:
    cmpq    %rbp, %rbx
    je  L27
    subq    $32, %rbx
    movq    (%rbx), %rdi
    leaq    16(%rbx), %rax
    cmpq    %rax, %rdi
    je  L29
    call    __ZdlPv
    jmp L29
L36:
    movq    (%rsp), %rdi
    addq    $16, %r12
    movq    %rax, %rbx
    cmpq    %r12, %rdi
    je  L32
    call    __ZdlPv
L32:
    movq    64(%rsp), %rdi
    leaq    48(%rbp), %rax
    cmpq    %rax, %rdi
    je  L33
    call    __ZdlPv
L33:
    movq    32(%rsp), %rdi
    addq    $16, %rbp
    cmpq    %rbp, %rdi
    je  L34
    call    __ZdlPv
L34:
    movq    %rbx, %rdi
LEHB4:
    call    __Unwind_Resume
L27:
    movq    %r12, %rdi
    call    __Unwind_Resume
LEHE4:
L37:
    movq    %rax, %rbx
    jmp L32
L35:
    movq    %rax, %r12
    xorl    %eax, %eax
    jmp L19
LFE1425:
    .section __DATA,__gcc_except_tab
GCC_except_table0:
LLSDA1425:
    .byte   0xff
    .byte   0xff
    .byte   0x3
    .byte   0x41
    .set L$set$0,LEHB0-LFB1425
    .long L$set$0
    .set L$set$1,LEHE0-LEHB0
    .long L$set$1
    .set L$set$2,L38-LFB1425
    .long L$set$2
    .byte   0
    .set L$set$3,LEHB1-LFB1425
    .long L$set$3
    .set L$set$4,LEHE1-LEHB1
    .long L$set$4
    .set L$set$5,L35-LFB1425
    .long L$set$5
    .byte   0
    .set L$set$6,LEHB2-LFB1425
    .long L$set$6
    .set L$set$7,LEHE2-LEHB2
    .long L$set$7
    .set L$set$8,L37-LFB1425
    .long L$set$8
    .byte   0
    .set L$set$9,LEHB3-LFB1425
    .long L$set$9
    .set L$set$10,LEHE3-LEHB3
    .long L$set$10
    .set L$set$11,L36-LFB1425
    .long L$set$11
    .byte   0
    .set L$set$12,LEHB4-LFB1425
    .long L$set$12
    .set L$set$13,LEHE4-LEHB4
    .long L$set$13
    .long   0
    .byte   0
    .section __TEXT,__text_startup,regular,pure_instructions
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE5:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE5:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB6:
    .align 4
__GLOBAL__sub_I_test1.cpp:
LFB1626:
    leaq    __ZStL8__ioinit(%rip), %rdi
    subq    $8, %rsp
LCFI26:
    call    __ZNSt8ios_base4InitC1Ev
    movq    __ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rdi
    leaq    ___dso_handle(%rip), %rdx
    addq    $8, %rsp
LCFI27:
    leaq    __ZStL8__ioinit(%rip), %rsi
    jmp ___cxa_atexit
LFE1626:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE6:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE6:
    .static_data
__ZStL8__ioinit:
    .space  1
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$14,LECIE1-LSCIE1
    .long L$set$14
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zPLR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x7
    .byte   0x9b
    .long   ___gxx_personality_v0+4@GOTPCREL
    .byte   0x10
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$15,LEFDE1-LASFDE1
    .long L$set$15
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1645-.
    .set L$set$16,LFE1645-LFB1645
    .quad L$set$16
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$17,LCFI0-LFB1645
    .long L$set$17
    .byte   0xe
    .byte   0x10
    .byte   0x8d
    .byte   0x2
    .byte   0x4
    .set L$set$18,LCFI1-LCFI0
    .long L$set$18
    .byte   0xe
    .byte   0x18
    .byte   0x8c
    .byte   0x3
    .byte   0x4
    .set L$set$19,LCFI2-LCFI1
    .long L$set$19
    .byte   0xe
    .byte   0x20
    .byte   0x86
    .byte   0x4
    .byte   0x4
    .set L$set$20,LCFI3-LCFI2
    .long L$set$20
    .byte   0xe
    .byte   0x28
    .byte   0x83
    .byte   0x5
    .byte   0x4
    .set L$set$21,LCFI4-LCFI3
    .long L$set$21
    .byte   0xe
    .byte   0x40
    .byte   0x4
    .set L$set$22,LCFI5-LCFI4
    .long L$set$22
    .byte   0xa
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$23,LCFI6-LCFI5
    .long L$set$23
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$24,LCFI7-LCFI6
    .long L$set$24
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$25,LCFI8-LCFI7
    .long L$set$25
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$26,LCFI9-LCFI8
    .long L$set$26
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$27,LCFI10-LCFI9
    .long L$set$27
    .byte   0xb
    .align 3
LEFDE1:
LSFDE3:
    .set L$set$28,LEFDE3-LASFDE3
    .long L$set$28
LASFDE3:
    .long   LASFDE3-EH_frame1
    .quad   LFB1425-.
    .set L$set$29,LFE1425-LFB1425
    .quad L$set$29
    .byte   0x8
    .quad   LLSDA1425-.
    .byte   0x4
    .set L$set$30,LCFI11-LFB1425
    .long L$set$30
    .byte   0xe
    .byte   0x10
    .byte   0x8f
    .byte   0x2
    .byte   0x4
    .set L$set$31,LCFI12-LCFI11
    .long L$set$31
    .byte   0xe
    .byte   0x18
    .byte   0x8e
    .byte   0x3
    .byte   0x4
    .set L$set$32,LCFI13-LCFI12
    .long L$set$32
    .byte   0xe
    .byte   0x20
    .byte   0x8d
    .byte   0x4
    .byte   0x4
    .set L$set$33,LCFI14-LCFI13
    .long L$set$33
    .byte   0xe
    .byte   0x28
    .byte   0x8c
    .byte   0x5
    .byte   0x4
    .set L$set$34,LCFI15-LCFI14
    .long L$set$34
    .byte   0xe
    .byte   0x30
    .byte   0x86
    .byte   0x6
    .byte   0x4
    .set L$set$35,LCFI16-LCFI15
    .long L$set$35
    .byte   0xe
    .byte   0x38
    .byte   0x83
    .byte   0x7
    .byte   0x4
    .set L$set$36,LCFI17-LCFI16
    .long L$set$36
    .byte   0xe
    .byte   0xa0,0x1
    .byte   0x4
    .set L$set$37,LCFI18-LCFI17
    .long L$set$37
    .byte   0xa
    .byte   0xe
    .byte   0x38
    .byte   0x4
    .set L$set$38,LCFI19-LCFI18
    .long L$set$38
    .byte   0xe
    .byte   0x30
    .byte   0x4
    .set L$set$39,LCFI20-LCFI19
    .long L$set$39
    .byte   0xe
    .byte   0x28
    .byte   0x4
    .set L$set$40,LCFI21-LCFI20
    .long L$set$40
    .byte   0xe
    .byte   0x20
    .byte   0x4
    .set L$set$41,LCFI22-LCFI21
    .long L$set$41
    .byte   0xe
    .byte   0x18
    .byte   0x4
    .set L$set$42,LCFI23-LCFI22
    .long L$set$42
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$43,LCFI24-LCFI23
    .long L$set$43
    .byte   0xe
    .byte   0x8
    .byte   0x4
    .set L$set$44,LCFI25-LCFI24
    .long L$set$44
    .byte   0xb
    .align 3
LEFDE3:
LSFDE5:
    .set L$set$45,LEFDE5-LASFDE5
    .long L$set$45
LASFDE5:
    .long   LASFDE5-EH_frame1
    .quad   LFB1626-.
    .set L$set$46,LFE1626-LFB1626
    .quad L$set$46
    .byte   0x8
    .quad   0
    .byte   0x4
    .set L$set$47,LCFI26-LFB1626
    .long L$set$47
    .byte   0xe
    .byte   0x10
    .byte   0x4
    .set L$set$48,LCFI27-LCFI26
    .long L$set$48
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE5:
    .mod_init_func
    .align 3
    .quad   __GLOBAL__sub_I_test1.cpp
    .constructor
    .destructor
    .align 1
    .subsections_via_symbols

您的示例中的程序集有 31,559 行,尝试将其发布到此处或 Pastebin 上导致我的浏览器选项卡崩溃。

不过,我还编写了上述程序的 C 版本:

#include <stdio.h>
#include <string.h>

#define NUM_PATTERNS 2

int main()
{
    const char *patterns[] = {"A", "D"};
    char *str = "ABCDEABCABD";
    unsigned int count = 0;

    for(int i = 0; i < NUM_PATTERNS; ++i)
    {
        for(char *s = str; ; )
        {
            s = strstr(s, patterns[i]);
            if(s == NULL)
            {
                break;
            }
            ++s;        // in order to not match again
            ++count;
        }
    }
    printf("%u\n", count);
    return 0;
}

编译后只有 95 行汇编代码:

    .cstring
LC0:
    .ascii "ABCDEABCABD\0"
LC1:
    .ascii "%u\12\0"
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTB2:
    .align 4
    .globl _main
_main:
LFB1:
    pushq   %rbx
LCFI0:
    leaq    LC0(%rip), %rdi
    xorl    %ebx, %ebx
    jmp L3
    .align 4
L8:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L3:
    movl    $65, %esi
    call    _strchr
    testq   %rax, %rax
    jne L8
    leaq    LC0(%rip), %rdi
    jmp L2
    .align 4
L9:
    leaq    1(%rax), %rdi
    addl    $1, %ebx
L2:
    movl    $68, %esi
    call    _strchr
    testq   %rax, %rax
    jne L9
    leaq    LC1(%rip), %rdi
    movl    %ebx, %esi
    xorl    %eax, %eax
    call    _printf
    xorl    %eax, %eax
    popq    %rbx
LCFI1:
    ret
LFE1:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE2:
    .section __TEXT,__text_startup,regular,pure_instructions
LHOTE2:
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$0,LECIE1-LSCIE1
    .long L$set$0
LSCIE1:
    .long   0
    .byte   0x1
    .ascii "zR\0"
    .byte   0x1
    .byte   0x78
    .byte   0x10
    .byte   0x1
    .byte   0x10
    .byte   0xc
    .byte   0x7
    .byte   0x8
    .byte   0x90
    .byte   0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$1,LEFDE1-LASFDE1
    .long L$set$1
LASFDE1:
    .long   LASFDE1-EH_frame1
    .quad   LFB1-.
    .set L$set$2,LFE1-LFB1
    .quad L$set$2
    .byte   0
    .byte   0x4
    .set L$set$3,LCFI0-LFB1
    .long L$set$3
    .byte   0xe
    .byte   0x10
    .byte   0x83
    .byte   0x2
    .byte   0x4
    .set L$set$4,LCFI1-LCFI0
    .long L$set$4
    .byte   0xe
    .byte   0x8
    .align 3
LEFDE1:
    .subsections_via_symbols

我建议您至少将这部分代码转换为 C。如果您有周围的 C++ 代码库,那么嵌入它应该不会有太大问题。

无论哪种方式,您都可以采用上述示例之一,更改 str要从文件中读取,请更改 patterns根据需要将数组设置为尽可能多的模式(确保将 NUM_PATTERNS 设置为相同的数量!),如果这仍然不足以提高性能,请实现将循环拆分为多个线程(这个主题太宽泛了)尽管在这个答案中涵盖了 C 和 C++)。

您当然也可以将命令行参数解析为模式数组:

#include <stdlib.h>
#include <string.h>

int numPatterns(char *str)
{
    int num = 1;
    for(int i = 0, len = strlen(str); i < len; ++i) // loop through all characters of the string
    {
        if(str[i] == '|')                           // increase the counter each time we hit a |
        {
            ++num;
        }
    }
    return num;
}

void splitPatterns(char *str, int num, char** dst)
{
    char *raw = (char*)(dst + num * sizeof(char*)); // this is the region after the pointers
    strcpy(raw, str);                               // first copy the string (note, this function is dangerous if used on anything other than strings! (google buffer overflow))
    dst[0] = raw;
    for(int i = 0, len = strlen(str), n = 1; i < len; ++i)
    {
        if(raw[i] == '|')
        {
            raw[i] = '\0';                          // replace all | by '\0' in order to split
            dst[n++] = &raw[i + 1];                 // and let the next string start at the next character
        }
    }
}

// let's say your program is called as ./program 'input_file' 'list|of|patterns'
int main(int argc, char **argv)
{
    /* check that argc >= 3 and blah */

    int num = numPatterns(argv[2]);
    // In order to malloc() and free() only once, I'm stashing things together here.
    // First comes an array of num pointers, hence num * sizeof(char*), followed by
    // the strings which take up the same space as argv[2] (the | just gets
    // replaced by \0), which equals strlen(argv[2]) + 1 for the terminating null byte.
    char **patterns = (char**)malloc(num * sizeof(char*) + strlen(argv[2]) + 1);
    splitPatterns(argv[2], num, patterns);

    /* work with patterns here */

    free(patterns);
}

如果您将此处的 C 示例编译为 C++,请确保更改 <*.h> 中的所有包含指令。至<c*> ,即<stdio.h><cstdio>等等

关于C++ 不使用正则表达式的多个字符串匹配,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36360005/

相关文章:

string - excel中的iserror公式

java - 如何在 Java 中比较字符串?

c++ - 图像在本地主机上显示已损坏

c++ - C/C++ : How does int array[10]={0} work?

python-3.x - 正则表达式从文本文件中捕获包含制表符/空格和子字符串的字符串部分

regex - AWK,如果特定列的模式匹配则拆分文件中的内容

java - 如何获取新替换字符串的新值?

c++ - 终端中光标闪烁的删除,如何?

c++ - 无法解析 NTSTATUS

java - 正则表达式java中方括号外的Charat运算符