c - 为 Flex 定义了 C token 文件?

标签 c tokenize flex-lexer

我想将一个 C 文件分割成标记,不是为了编译而是为了分析。我觉得这应该非常简单,于是尝试在网上查找已经定义好全部 C 语法的 Flex 文件,但什么也没找到。我想知道是否存在这样现成的语法定义,还是说我的思路完全错了?

最佳答案

是的,至少有一个现成的。

编辑:

由于那个定义有一些它处理不了的问题,也许值得看看我几年前(手工)编写的一些词法分析代码。它基本上只处理翻译阶段 1、2 和 3。如果定义了 DIGRAPH,它还会启用一些代码来转换 C++ 双字符组(digraph)。不过,如果我没记错的话,它进行这一转换的时机比标准规定的翻译阶段要早,但无论如何您可能都用不到它。另一方面,它甚至没有尝试识别接近全部的标记——主要是把源代码分成注释、字符字面量、字符串字面量和几乎所有其他内容。不过,它确实能处理三字符组(trigraph)、行拼接(line splicing)等。

我想我还应该补充一点:由于文件是以翻译(文本)模式打开的,把平台的行结束符转换为换行符这件事就交给了底层实现。在大多数情况下,这可能是正确的做法,但如果您想做类似交叉编译器的东西,而源文件的行结束序列与本机通常使用的不同,您可能就必须修改这一点。

首先是定义所有这些东西的外部接口(interface)的 header :

/* get_src.h */   
#ifndef GET_SRC_INCLUDED
#define GET_SRC_INCLUDED

#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/* This is the size of the largest token we'll attempt to deal with.  If
 * you want to deal with bigger tokens, change this, and recompile
 * get_src.c.  Note that an entire comment is treated as a single token,
 * so long comments could overflow this.  In case of an overflow, the
 * entire comment will be read as a single token, but the part larger
 * than this will not be stored.
 */
#define MAX_TOKEN_SIZE 8192

/* `last_token' will contain the text of the most recently read token (comment,
 * string literal, or character literal).
 */
extern char last_token[];

/* This is the maximum number of characters that can be put back into a
 * file opened with parse_fopen or parse_ffopen.
 */
#define MAX_UNGETS 5

#include <limits.h>
#include <stdio.h>

/* State for one source file being read: the underlying stdio stream plus
 * a small stack of characters that have been pushed back onto it.
 */
typedef struct {
    FILE *file;                 /* stream the characters are read from */
    char peeks[MAX_UNGETS];     /* pushed-back characters; the most recently pushed one is read first */
    int last_peek;              /* number of characters currently held in `peeks' */
} PFILE;

/* Codes returned to indicate that one of various items was found in the
 * source code.  ERROR is returned to indicate a newline found in the
 * middle of a character or string literal, a file that ends inside a
 * comment, or a character literal that contains more than two characters.
 *
 * Note that the codes start at INT_MIN, the most negative number available
 * in an int, and count upwards from there.  This keeps these symbols from
 * conflicting with any character read from the file.  However, one of them
 * could theoretically conflict with EOF.  EOF is usually -1, and these are
 * far more negative than that.  However, officially EOF can be any value
 * less than 0...
 */
enum {
    ERROR = INT_MIN,
    COMMENT,
    CHAR_LIT,
    STR_LIT
};

/* Opens the file called `name' for parsing (in text mode) and returns a
 * pointer to a structure which can be passed to the other functions in the
 * parser/lexer to identify the file being worked with.  Returns NULL if
 * that structure cannot be allocated.  Release the result with
 * parse_fclose.
 */
PFILE *parse_fopen(char const *name);

/* This corresponds closely to fdopen, except that it takes an already
 * open FILE * (rather than a file descriptor) as its only parameter.  It
 * creates a PFILE structure identifying that file and returns a pointer to
 * that structure, or NULL if the structure cannot be allocated.  Passing
 * the result to parse_fclose also closes `stream'.
 */
PFILE *parse_ffopen(FILE *stream);

/* Corresponds to fclose: closes the FILE held by `stream', frees the
 * PFILE structure itself, and returns the value fclose returned.  `stream'
 * must not be used afterwards.
 */
int parse_fclose(PFILE *stream);

/* Returns the next item from `stream' read as C source code.  String
 * literals, character literals and comments are each returned as a
 * single code from those above (STR_LIT, CHAR_LIT, COMMENT, or ERROR for
 * a malformed one), with their text left in `last_token'.  Anything else
 * is returned one character at a time: a run of whitespace is returned as
 * a single space character, except that a newline is returned as '\n'.
 * Returns EOF at the end of the file.
 */
int get_source(PFILE *stream);

/* Basically, these two work just like the normal versions of the same
 * (getc/ungetc), with the minor exception that unget_character can unget
 * more than one character: up to MAX_UNGETS characters can be pending at
 * once, and any pushed back beyond that are silently discarded.  Pushed
 * back characters are read again most-recent-first.
 *
 * get_character additionally collapses whitespace the way get_source
 * describes and handles backslash-newline line splicing.
 */
int get_character(PFILE *stream);
void unget_character(int ch, PFILE *stream);

#ifdef __cplusplus
}
#endif

#endif

然后是所有这些的实现:

/* get_src.c */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>

#define GET_SOURCE
#include "get_src.h"

/* Index in `last_token' at which the next character will be stored. */
static size_t current = 0;

/* Text of the token currently being built / most recently read. */
char last_token[MAX_TOKEN_SIZE];

PFILE *parse_fopen(char const *name) {
/* Opens the file `name' for reading in text mode and wraps it in a newly
 * allocated PFILE whose push-back buffer is empty.
 *
 * Returns the new PFILE, or NULL if either the allocation or the fopen
 * fails.  Nothing is leaked on failure.  The result must be released with
 * parse_fclose.
 */

    PFILE *temp = malloc(sizeof *temp);

    if ( NULL == temp )
        return NULL;

    temp->file = fopen(name, "r");

    /* Without this check a PFILE holding a NULL FILE * would be handed
     * back, the caller's NULL test would pass, and the first read would
     * call fgetc(NULL).
     */
    if ( NULL == temp->file ) {
        free(temp);
        return NULL;
    }

    memset(temp->peeks, 0, sizeof(temp->peeks));
    temp->last_peek = 0;
    return temp;
}

PFILE *parse_ffopen(FILE *file) {
/* Wraps the already-open stream `file' in a newly allocated PFILE with an
 * empty push-back buffer.  Returns the PFILE, or NULL if it cannot be
 * allocated.
 */

    PFILE *wrapper = malloc(sizeof(PFILE));

    if ( NULL == wrapper )
        return NULL;

    wrapper->last_peek = 0;
    memset(wrapper->peeks, 0, sizeof(wrapper->peeks));
    wrapper->file = file;

    return wrapper;
}

int parse_fclose(PFILE *stream) {
/* Closes the FILE held by `stream' and frees the PFILE itself.
 *
 * Returns what fclose returned, or EOF if `stream' is NULL or holds no
 * open FILE.  Both cases are tolerated rather than dereferencing a NULL
 * pointer or passing NULL to fclose.
 */

    int retval = EOF;

    if ( NULL == stream )
        return EOF;

    if ( NULL != stream->file )
        retval = fclose(stream->file);

    free(stream);
    return retval;
}

static void addchar(int ch) {
/* Adds the passed character to the end of `last_token'.
 *
 * The buffer is kept NUL-terminated after every addition, so `last_token'
 * is a valid string even when a reader gives up part-way through a token
 * (for example when it returns ERROR) without adding a terminator itself.
 * Once the buffer is full, further characters are silently dropped; the
 * final byte is always reserved for the terminator.
 */

    if ( current < sizeof(last_token) - 1 ) {
        last_token[current++] = (char)ch;
        last_token[current] = '\0';
    }
}

static void clear(void) {
/* Discards the previous token and starts building a new one: the next
 * addchar call stores its character at the start of `last_token'.  The
 * buffer contents themselves are not erased.
 */
    current = 0;
}

static int read_char(PFILE *stream) {
/* Returns the next raw character from `stream': the most recently pushed
 * back character if there is one, otherwise the next character of the
 * underlying file (or EOF).
 *
 * Pushed-back characters live in a plain `char' array, so they are
 * converted back to the value fgetc would have produced: an unsigned char
 * value, or EOF.  Returning the stored char directly would hand back
 * negative values for bytes >= 0x80 where char is signed (which callers
 * would mistake for token codes, and which may not be passed to the
 * <ctype.h> functions), and would turn a pushed-back EOF into 255 where
 * char is unsigned.
 *
 * Limitation: a pushed-back byte equal to (char)EOF (0xFF on usual
 * platforms) cannot be told apart from a pushed-back EOF and is read back
 * as EOF.
 */
    if ( stream->last_peek > 0 ) {
        char stored = stream->peeks[--stream->last_peek];

        if ( stored == (char)EOF )
            return EOF;
        return (unsigned char)stored;
    }
    return fgetc(stream->file);
}

void unget_character(int ch, PFILE * stream) {
/* Pushes `ch' back onto `stream' so that it is the next character read.
 * If the push-back buffer is already full the character is silently
 * discarded.
 */
    size_t const capacity = sizeof(stream->peeks);

    if ( (size_t)stream->last_peek >= capacity )
        return;

    stream->peeks[stream->last_peek] = (char)ch;
    stream->last_peek += 1;
}

static int check_trigraph(PFILE *stream) {
/* Checks for trigraphs and returns the equivalent character if there
 * is one.  Expects that the leading '?' of the trigraph has already
 * been read before this is called.
 *
 * If what follows does not complete a trigraph, returns '?' and pushes
 * the characters that were looked at back onto the stream so they are
 * read again, in their original order.
 */

    int second;
    int third;

    second = read_char(stream);
    if ( '?' != second ) {
        unget_character(second, stream);
        return '?';
    }

    third = read_char(stream);

    switch ( third ) {
        case '(':   return '[';
        case ')':   return ']';
        case '/':   return '\\';
        case '\'':  return '^';
        case '<':   return '{';
        case '>':   return '}';
        case '!':   return '|';
        case '-':   return '~';
        case '=':   return '#';
        default:
            /* "??x" is not a trigraph: hand back the first '?' now and
             * leave "?x" to be re-read.  The push-back buffer is a stack
             * (last pushed, first read), so the later character must be
             * pushed first; pushing '?' first would replay the input as
             * "x?".
             */
            unget_character(third, stream);
            unget_character('?', stream);
            return '?';
    }
}

#ifdef DIGRAPH
static int check_digraph(PFILE *stream, int first) {
/* Checks whether `first' (already read by the caller) and the character
 * that follows it form a digraph.  Returns the character the digraph
 * stands for; otherwise pushes the second character back and returns
 * `first' unchanged.
 */

    int const second = read_char(stream);
    int replacement = 0;    /* 0 means "no digraph recognised" */

    if ( '<' == first ) {
        if ( '%' == second )
            replacement = '{';
        else if ( ':' == second )
            replacement = '[';
    } else if ( ':' == first ) {
        if ( '>' == second )
            replacement = ']';
    } else if ( '%' == first ) {
        if ( '>' == second )
            replacement = '}';
        else if ( ':' == second )
            replacement = '#';
    }

    if ( 0 != replacement )
        return replacement;

    /* Not one of the combinations above: the two characters are returned
     * separately, so put the second one back and return the first as-is.
     */
    unget_character(second, stream);
    return first;
}
#endif


static int get_char(PFILE *stream) {
/* Reads one character from the stream, replacing a trigraph (and, when
 * DIGRAPH is defined, a digraph) by the single character it represents.
 * Note that handling digraphs this early in translation isn't really
 * correct (and shouldn't happen in C at all).
 */
    int const ch = read_char(stream);

    switch ( ch ) {
        case '?':
            return check_trigraph(stream);

#ifdef DIGRAPH
        case '<':
        case ':':
        case '%':
            return check_digraph(stream, ch);
#endif

        default:
            return ch;
    }
}

int get_character(PFILE *stream) {
/* Gets a character from `stream'.
 *
 * - A backslash immediately followed by a newline (a line splice) is
 *   "eaten": both characters are discarded and reading continues with
 *   whatever follows, with nothing inserted in their place.
 * - A backslash followed by anything else is returned as '\\'; the
 *   following character is left to be read by the next call.
 * - A run of whitespace is returned as a single space, unless the run
 *   reaches a newline, in which case '\n' is returned.
 * - Anything else (including EOF) is returned unchanged.
 */
    int ch;

    /* handle line-splicing */
    for (;;) {
        int next;

        ch = get_char(stream);
        if ( '\\' != ch )
            break;

        next = get_char(stream);
        if ( '\n' != next ) {
            /* An ordinary backslash: report the backslash itself, and
             * put back the character that was only looked at.
             */
            unget_character(next, stream);
            return '\\';
        }
        /* backslash-newline: drop both and read on */
    }

    /* The `ch < 0' tests keep EOF (and any other negative value) away
     * from isspace, whose argument must be EOF or an unsigned char value.
     */
    if ( ch < 0 || !isspace(ch) )
        return ch;

    /* It's a space: skip over consecutive white-space, stopping at a newline */
    while ( ch >= 0 && isspace(ch) && '\n' != ch )
        ch = get_char(stream);

    if ( '\n' == ch )
        return ch;

    /* Then put the non-ws character back */
    unget_character(ch, stream);

    /* and return a single space character... */
    return ' ';
}

static void add_escape_digits(PFILE *stream, int hex, int limit) {
/* Appends to `last_token' the digits that continue a numeric escape
 * sequence: hexadecimal digits if `hex' is non-zero, octal digits
 * otherwise.  At most `limit' digits are taken (no limit if `limit' is
 * negative).  The first character that is not such a digit is pushed back.
 */
    while ( 0 != limit ) {
        int digit = get_char(stream);
        int wanted = hex ? (digit >= 0 && isxdigit(digit))
                         : (digit >= '0' && digit <= '7');

        if ( !wanted ) {
            unget_character(digit, stream);
            return;
        }

        addchar(digit);
        if ( limit > 0 )
            --limit;
    }
}

static int read_char_lit(PFILE *stream) {
/* This is used internally by `get_source' - it expects the opening quote
 * of a character literal to have already been read, and leaves the text
 * of the literal in `last_token'.
 *
 * Returns CHAR_LIT, or ERROR if a newline or the end of the file is
 * reached before the closing quote, or if the literal contains more than
 * two characters after escapes are taken into account (an escape sequence
 * such as \n, \x41 or \101 counts as one character).
 *
 * The literal is always read through to its closing quote before the
 * length is judged, so the stream stays in step with the source even for
 * a literal that is too long.  Characters are read with trigraph
 * replacement, as everywhere else in the lexer.
 */

    int ch;
    int count = 0;      /* characters seen so far, one per escape sequence */

    clear();
    addchar('\'');

    while ( '\'' != ( ch = get_char(stream))) {

        if ( EOF == ch ) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        if ( '\n' == ch ) {
            addchar('\0');
            return ERROR;
        }

        if ( '\\' == ch ) {
            ch = get_char(stream);
            if ( EOF == ch ) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);

            /* Numeric escapes run on for several more characters. */
            if ( 'x' == ch || 'u' == ch || 'U' == ch )
                add_escape_digits(stream, 1, -1);
            else if ( ch >= '0' && ch <= '7' )
                add_escape_digits(stream, 0, 2);    /* octal: 3 digits at most */
        }

        count++;
    }
    addchar('\'');
    addchar('\0');

    if ( count > 2 )
        return ERROR;

    return CHAR_LIT;
}

static int read_str_lit(PFILE *stream) {
/* Used internally by get_source.  Expects the opening quote of a string
 * literal to have already been read, and collects the literal (quotes
 * included) in `last_token'.  Returns STR_LIT, or ERROR if an un-escaped
 * newline or the end of the file is found before the close quote.
 */

    int ch;

    clear();
    addchar('"');

    for (;;) {
        ch = get_char(stream);

        if ( '"' == ch )
            break;

        if ( EOF == ch || '\n' == ch )
            return ERROR;

        addchar(ch);

        if ( '\\' != ch )
            continue;

        /* Escape sequence: the character after the backslash is taken
         * as-is, so an escaped quote does not end the literal.
         */
        ch = read_char(stream);
        addchar(ch);
    }

    addchar('"');
    addchar('\0');

    return STR_LIT;
}

static int read_comment(PFILE *stream) {
/* Reads a comment from stream into `last_token'.  Assumes the leading '/'
 * has already been read.  Handles C++ single line comments as well as
 * normal C comments.
 *
 * Returns:
 *   COMMENT  a complete comment was read.  For a single line comment the
 *            terminating newline is consumed but not stored; a single
 *            line comment ended by the end of the file also counts.
 *   ERROR    the file ended inside a C comment.
 *   '/'      the '/' did not start a comment; the character after it is
 *            pushed back.
 */
    int ch;
    int prev;

    clear();

    ch = get_char(stream);

    /* Handle a single line comment.
     */
    if ( '/' == ch ) {
        addchar('/');
        addchar('/');

        /* Stop at EOF as well as at newline: a file whose last line is a
         * comment with no trailing newline would otherwise never end the
         * loop.
         */
        while ( '\n' != ( ch = get_char(stream)) && EOF != ch )
            addchar(ch);

        addchar('\0');
        return COMMENT;
    }

    if ( '*' != ch ) {
        unget_character(ch, stream);
        return '/';
    }

    addchar('/');
    addchar('*');

    /* The comment ends at the first '/' that directly follows a '*'.
     * Remembering the previous character copes with runs of stars such as
     * the closing sequence in a comment that ends with two stars and a
     * slash.  `prev' starts out as 0 so that the '*' of the opening
     * sequence cannot pair with an immediately following '/'.
     */
    prev = 0;
    for (;;) {
        ch = get_char(stream);

        if ( EOF == ch ) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        if ( '*' == prev && '/' == ch )
            break;

        prev = ch;
    }

    addchar('\0');

    return COMMENT;
}

int get_source(PFILE *stream) {
/* Reads and returns a single "item" from the stream.  An "item" is a
 * comment, a literal, or a single character after trigraph and possible
 * digraph substitution has taken place.  Comments and literals are
 * reported by the code their reader returns; every other character is
 * returned as itself.
 */

    int const first = get_character(stream);

    if ( '\'' == first )
        return read_char_lit(stream);

    if ( '"' == first )
        return read_str_lit(stream);

    if ( '/' == first )
        return read_comment(stream);

    return first;
}

#ifdef TEST

int main(int argc, char **argv)  {
/* Test driver: echoes the file named on the command line, printing each
 * comment/literal (any negative item code) on a line of its own and every
 * other character as-is.
 */
    PFILE *src;
    int item;

    if (2 != argc) {
        fprintf(stderr, "Usage: get_src <filename>\n");
        return EXIT_FAILURE;
    }

    src = parse_fopen(argv[1]);
    if (NULL == src) {
        fprintf(stderr, "Unable to open: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    for (item = get_source(src); EOF != item; item = get_source(src)) {
        if (item >= 0)
            printf("%c", item);
        else
            printf("\n%s\n", last_token);
    }

    parse_fclose(src);
    return 0;
}

#endif

我不确定将其集成到基于 Flex 的词法分析器中会有多容易/困难——我似乎记得 Flex 有某种钩子(Hook)来定义它用来读取字符的内容,但我从未尝试过使用它,所以我不能对它说太多(最终,甚至不能肯定地说它存在)。

关于c - 为 Flex 定义了 C token 文件?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/7051694/

相关文章:

c - 使用 gdb 调试 C 程序

c - 使用 Windows MIDI API 时出现问题(播放时没有回调)

c - MinGW : reading binary data fail

python - 使用 Python NLTK 对大型 (>70MB) TXT 文件进行标记。串联并写入数据以流式传输错误

javascript - Jquery 表单重置

c - 如何在词法分析过程中检测字符串?

c - #define 中的方括号

C++ - 为什么分词器从文件中读取行这么慢?

regex - 匹配文件中最后一个换行符的正则表达式

Bison ,@1 和 $1 之间的差异