c - 在帕特里夏/基数树上插入单词时遇到问题

标签 c radix recursive-datastructures patricia-trie

我正在尝试为帕特里夏/基数树的大学应用编写代码,该代码插入从树中的txt文件中的每一行读取的单词,因此如果我读取一个文件,内容如下:

roman
romance
romantic

它应该像这样实现

    (root)
      |
    roman$
   /     \
 ce$      tic$

我还添加了检查点,这样我就可以测试程序在死亡之前能走多远 所以我的结构设置如下:

typedef struct no no; // declaration of the struct no

typedef struct no {
    char *str; // pointer to store a string
    struct no *child[26]; // Array of pointers to all possible children of the tree
    int isword; // Flag to know if its a full word
    int isroot; // Flag to know if its the root of the tree
} tree;

和我的 insertintree 声明:

no* InsertWord(no *r, char *str){
    printf("checkpoint 5 %s\n", str);
    //caso 1, we are at the root
    if(r->isroot){
        r->child[r->str[0] - 'a'] = InsertWord(r->str[r->str[0] - 'a'], str);
        return r;
    }
    //caso 2, found an empty node
    if(r->str == NULL){
        strcpy(r->str, str);
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    //caso 3, word is a perfect fit for the prefix
    if(strcmp(r->str, str)){
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    int i;
    for(i = 0; str[i] == r->str[i]; i++);
    //caso 4, the prefix fits the first part of the word, but there is still more to be inserted
    if(i == strlen(r->str)){
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
    //caso 5, the word fits the prefix, but there is still more prefix
    if(i == strlen(str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->filhos[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        return r;
    }
    //caso 6, the first part of the word fits with the first part of the preix, but then they differ
    if(i < strlen(str) && i < strlen(r->str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->child[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
}

我当前的输出看起来像这样

checkpoint 1
checkpoint 2
checkpoint 3 roman
checkpoint 4 roman
checkpoint 5
checkpoint 5

然后程序就死掉了,检查点 1 到 4 都与读取文件和验证单词的函数相关,而且它们似乎都运行良好

我尝试向程序添加动态分配,似乎对解决问题没有帮助,我知道我应该提供一个最小的可重现示例,但我不太确定问题出在哪里 (代码最初不是英文的,我尽力提供翻译,但如果有任何部分仍然是葡萄牙语,我没有注意到,我很抱歉)

既然有人要求我,我将添加调用此函数的其余函数,它有点长,但在这里

no *ValidateWord(no *r, char *str) {
    int i;
    printf("checkpoint 3 %s\n", str);
    for (i = 0; str[i] != '\0'; i++) {
        if (str[i] < 'A' || (str[i] > 'Z' && str[i] < 'a') || str[i] > 'z') {
            return r;
        } else if (str[i] >= 'A' && str[i] <= 'Z') {
            str[i] = str[i] + 32;
        }
    }
    printf("checkpoint 4 %s\n", str);
    return InsertWord(r, str);
}

no *LoadFile(no *r) {
    printf("checkpoint 1");
    printf("Insert the name of the chosen file: ");
    char namefile[100];
    scanf("%s", namefile);
    FILE *fp1;
    fp1 = fopen(namefile, "r");
    if (fp1 == NULL) {
        printf("couldnt open the file.\n");
        return r;
    }
    char str[100];
    while (fgets(str, sizeof(str), fp1)) {
        printf("checkpoint 2 %s\n", str);
        str[strcspn(str, "\n")] = '\0';
        r = ValidateWord(r, str);
    }
    fclose(fp1);
    return r;
}

这是调用所有这些的主文件

void menu(no* r) {
    int c = 0;
    printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
    scanf("%d", &c);
    while (c != 0) {
        if (c == 1)
            r = LoadFile(r); // Calls the LoadFile function
        else if (c == 2)
            CheckWords(r); // calls the function CheckWords
        else if (c == 3)
            PrintDictionary(r); // Calls the fucntion PrintDictionary
        else if (c == 4)
            r = LoadStopwordsFile(r); // calls the LoadStopwordsFile function
        printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
        scanf("%d", &c);
    }
}

int main() {
    no* r = (no*) malloc(sizeof(no)); // alocates memory for the root
    if (r == NULL) {
        printf("Error alocating memory.\n");
        return 1; // closes the aplication
    }
    r->isroot = 1; // defines the isroot flag to true
    menu(&r); // calls the menu function
    free(r); // frees the memory alocated for the root
    return 0;
}

最佳答案

Patrica 树是隐式二叉基数树的一种特殊情况,它比这里的更紧凑(当然是迂腐的。)

我不确定您是否需要isroot;它是干什么用的?空树?

isword$ 的带外符号;您不需要(或想要)同一事物有两个符号。 (以 null 结尾的字符串的自然 trie 条目是使用 '\0' 作为终端后缀 $。)

Morin, Patrick. "Data Structures for Strings"

As an optimization, edge labels can be stored in constant size by using two pointers to a string (for the first and last elements)

另一种选择是不复制字符串,而是仅指向子字符串,而不是复制和操作字符串。

编辑:在Java code in the tutorial中您遵循的,它使用带外 isword 并复制子字符串。看起来您正在分配节点,但不是分配字符串副本,而 Java 会自动处理字符串副本。

我认为以下是一个很好的类比:子字符串存储在 flexible array member 中在struct的末尾;这样,节点和字符串的副本就处于连续的内存分配中,从而减少了碎片和对 malloc 的调用。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

// A flexible array member for the substring.
struct node {
    struct node *child[26];
    int isword;
    char str[];
};
struct trie { struct node *root; };

// graph helper
static void graph_logic(const struct node *const node,
    const struct node *const parent, FILE *const fp) {
    fprintf(fp,
        "\tnode%p [label = \"%s\"];\n"
        "\tnode%p -> node%p [label=\"%s\"];\n",
        (const void *)node, node->isword ? "word" : "not",
        (const void *)parent, (const void *)node, node->str);
    for(struct node *const*i = node->child,
        *const*end = i + sizeof node->child / sizeof *node->child;
        i < end; i++) if(*i) graph_logic(*i, node, fp);
}
// graphs a trie in GraphViz format: https://graphviz.org/
static void graph(const struct trie *const trie, const char *const fn) {
    FILE *fp;
    assert(trie && fn);
    if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
    fprintf(fp, "digraph {\n"
        "\tgraph [truecolor=true, bgcolor=transparent, fontname=modern];\n"
        "\tnode [shape=none, fontname=modern];\n"
        "\tnode%p [label=\"root\"];\n", (void *)0);
    if(trie->root) graph_logic(trie->root, 0, fp);
    fprintf(fp, "}\n");
    fclose(fp);
}

static int InsertWord(struct trie *t, char *str){
    // The tree is empty.
    if(t->root == NULL){
        struct node *r;
        const size_t len = strlen(str);
        if(!(r = malloc(sizeof t->root + len + 1))) return 0; // FAM
        *r = (struct node){0};
        r->isword = 1;
        memcpy(r->str, str, len + 1);
        t->root = r;
        printf("word %s inserted as the first word\n", str);
        return 1;
    }
    assert(0); // continue this function . . .
    /* (For each edge.)
        (For each character, check that it's a match for the edge, if it's
        not, create two nodes from the split. Free the original.)
        (Or if it's the empty string, the word is already there.)
        (May be helpful to port `getFirstMismatchLetter`.) */
    return 0;
}
static int ValidateWord(struct trie *t, char *str) {
    int i;
    assert(t && str);
    printf("ValidateWord checkpoint: %s\n", str);
    // simplified
    for (i = 0; str[i] != '\0'; i++) if(!islower(str[i])) return 0;
    return InsertWord(t, str);
}

// this inputs https://en.wikipedia.org/wiki/Radix_tree
static int test(const char *const fn) {
    FILE *fp = 0;
    int success = 1;
    if(!(fp = fopen(fn, "w"))) goto catch;
    fprintf(fp,
        "romane\n"
        "romanus\n"
        "romulus\n"
        "rubens\n"
        "ruber\n"
        "rubicon\n"
        "rubicundus\n");
    fclose(fp), fp = 0;
    if(!freopen(fn, "r", stdin)) goto catch;
    goto finally;
catch:
    success = 0;
    perror(fn);
finally:
    if(fp) fclose(fp), fp = 0;
    return success;
}

int main(void) {
    char str[100];
    struct trie t = {0};
    struct { int no; char fn[100]; } debug = { 0, { 0 } };

    // ideally, tests should be automatic
    if(!test("trie.txt")) exit(EXIT_FAILURE);

    while(fgets(str, sizeof(str), stdin)) {
        str[strcspn(str, "\n")] = '\0';
        // Print a graph so we can see what's going on.
        sprintf(debug.fn, "trie%d.gv", debug.no++), graph(&t, debug.fn);
        int is_insert = ValidateWord(&t, str);
        assert(is_insert);
    }
    // memory leak: all the nodes need to be freed from the bottom.
    free(t.root);
    return 0;
}

With one word.

关于c - 在帕特里夏/基数树上插入单词时遇到问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/76543280/

相关文章:

c - 打印值,C

c - 如何在 c 中关闭 #elif 语句?

powershell - 如果文件不存在,则使用PowerShell将文件移动到上一级的远程位置

haskell - 有没有一种优雅的方式让函数返回相同类型的函数(在元组中)

在运行时更改库加载顺序(如 LD_PRELOAD 但在执行期间)

c - 读取文件中的意外值

c# - 从继承类获取特定方法

Bash 根据 split 命令计算字母后缀(即带有字母的 26 进制整数)

c - IPv4 地址的 inet_aton 规范化

java - 为什么这个方法会导致无限递归调用?