c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序

我正在做一项作业，要求我打印给定文本文件中出现次数最多的 10 个单词。我的代码正在打印文件中的单词，但它没有根据它们的频率对它们进行排序。

这是我的代码如下。我使用哈希表来存储每个唯一的单词及其频率。我目前正在使用我编写的 wordcmp 函数对单词进行排序，并在 main 中的内置 qsort 函数中调用它。

如果有人可以指导我纠正我的错误，我将非常感激。

我当前的输出:

排名前 10 的单词(共 10 个)是:

1 即时

1 个

又是1

3快乐

2你好

1 如何实现

1 让

1 个你

1 次尝试

1 这个

预期输出(我想要的):

排名前 10 的单词(共 10 个)是:

3快乐

2你好

1 个你

1 次尝试

1 这个

1 让

1 即时

1 如何实现

1 个

又是1

这是我的一些代码:

typedef struct word
{ 
  char *s;          /* the word */
  int count;        /* number of times word occurs */
  struct word* next;
}word;

struct hashtable
{
  word **table;
  int tablesize;
  int currentsize;
};
typedef struct hashtable hashtable;
int main(int argc, char *argv[])
{

    int top_words = 10;
    word *word = NULL;
    hashtable *hash = ht_create(5000);
    char *file_name;
    char *file_word;
    FILE *fp;
    struct word *present = NULL;

    fp = fopen (file_name, "r");
    if (fp == NULL)
    {
        fprintf (stderr,"%s: No such file or directory\n", file_name);
        fprintf(stderr,"The top %d words (out of 0) are:\n", top_words); 
        exit(-1);
    }

    continue_program:
    while ((file_word = getWord(fp)))
    {
        word = add(hash, file_word, 1);
    }
    fclose(fp);

    qsort((void*)hash->table, hash->currentsize, sizeof(word),(int (*)(const void *, const void *)) wordcmp);

    if(top_words > total_unique_words)
          top_words = total_unique_words;

    printf("the top %d words (out of %d) are:\n", top_words, total_unique_words);

    int iterations =0;
    for(i =0; i <= hash->tablesize && iterations< top_words; i++)
    {
          present = hash->table[i];
          if(present != NULL)
          {
              printf("     %4d %s\n", present->count, present->s);
              present = present->next;
              iterations++;
          }
    }
    freetable(hash);

 return 0;
}

int wordcmp (word *a, word *b) 
{
    if (a != NULL && b!= NULL) {

    if (a->count < b->count) 
    {
      return +1;     
    }
    else if (a->count > b->count) 
    {
        return -1; 
    }
    else if (a->count == b->count)
    {
      /*return strcmp(b->s, a->s);*/
      return 0;
    }
  }
  return 0;
}

/* Create a new hashtable. */
struct hashtable *ht_create( int size ) 
{
  int i;

  if( size < 1 ) 
    return NULL;

  hashtable *table = (hashtable *) malloc(sizeof(hashtable));
  table->table = (word **) malloc(sizeof(word *) * size);

  if(table != NULL)
  {
      table->currentsize = 0;
      table->tablesize = size;
  }

  for( i = 0; i < size; i++ ) 
  {
    table->table[i] = NULL;
  }

  return table; 
}

/* Adds a new node to the hash table*/
word * add(hashtable *h, char *key, int freq) 
{
    int index = hashcode(key) % h->tablesize;
    word *current = h->table[index];

    /* Search for duplicate value */
    while(current != NULL) {
        if(contains(h, key) == 1){
            current->count++;
            return current;
       }
         current = current->next;
     }

    /* Create new node if no duplicate is found */
    word *newnode = (struct word*)malloc(sizeof(struct word));
    if(newnode!=NULL){
          newnode->s =strdup(key);
          newnode-> count = freq;
          newnode-> next = NULL;
    }
    h->table[index] = newnode;
    h->currentsize = h->currentsize + 1;
    total_unique_words++;
    return newnode;
}

最佳答案

您面临的主要问题是尝试使用存储桶的链表链接对哈希表进行排序。当发生哈希冲突时，您的表不会调整大小，您只需使用链接列表将导致冲突的单词存储在与已存储在那里的单词链接的同一个 table[index] 中。这就是 add 的作用。

这很容易导致哈希表的内容如下所示:

table[ 0] = NULL
table[ 1] = foo
table[ 2] = NULL
table[ 3] = |some|->|words|->|that|->|collided|  /* chained bucket */
table[ 4] = other
table[ 5] = words
table[ 6] = NULL
table[ 7] = NULL
...

你不能简单地qsort表并希望得到正确的词频。 qsort 无法知道 "some" 只是链表中的开头单词，所有 qsort 获取的都是指向 “一些”和sizeof(word)。

为了让生活变得更轻松，只需忘记哈希表，并使用动态分配的 word** 数组即可。您可以使用类似的添加来增加重复项的出现次数，并避免链式存储桶的所有问题。 (如果您为每个单词提供自动存储，它会给您留下一个简单的free()指针，然后您就完成了)

以下示例采用 2 个参数。第一个是要从中读取单词的文件名，第二个整数值(可选)将排序后的输出限制为最高单词数。 words_t 结构使用自动存储限制为 32 个字符的 word(未删节词典中最大的单词为 28 个字符)。您可以更改单词或阅读的方式来解析输入并根据需要忽略标点符号和复数。以下内容在所有标点符号上分隔单词(连字符除外)，并丢弃单词的复数形式(例如，当遇到 "Mike's" 时，它会存储 "Mike" ，并丢弃“的”)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>

#define MAXC   32   /* max word length is 28-char, 29-char is sufficient */
#define MAXW  128   /* initial maximum number of words to allocate */

typedef struct {
    char word[MAXC];    /* struct holding individual words */
    size_t ninst;       /* and the number of times they occur */
} words_t;

/*  function prototypes */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw);
void *xrealloc (void *ptr, size_t psz, size_t *nelem);

/* qsort compare function for words_t (alphabetical) */
int cmpwrds (const void *a, const void *b)
{
    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

/* qsort compare function for words_t (by occurrence - descending)
 * and alphabetical (ascending) if occurrences are equal)
 */
int cmpinst (const void *a, const void *b)
{
    int ndiff =  (((words_t *)a)->ninst < ((words_t *)b)->ninst) - 
                (((words_t *)a)->ninst > ((words_t *)b)->ninst);

    if (ndiff)
        return ndiff;

    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

int main (int argc, char **argv) {

    int c = 0, nc = 0, prev = ' ', total = 0;
    size_t maxw = MAXW, wc = 0, top = 0;
    char buf[MAXC] = "";
    words_t *words = NULL;
    FILE *fp = fopen (argv[1], "r");

    if (!fp) {  /* validate file open for reading */
        fprintf (stderr, "error: file open failed '%s'.\n", argv[1]);
        return 1;
    }

    if (argc > 2) { /* if 2 args, convert argv[2] to number of top words */
        char *p = argv[2];
        size_t tmp = strtoul (argv[2], &p, 0);
        if (p != argv[2] && !errno)
            top = tmp;
    }

    /* allocate/validate initial words */
    if (!(words = calloc (maxw, sizeof *words))) {
        perror ("calloc-words");
        return 1;
    }

    while ((c = fgetc(fp)) != EOF) {        /* read each character in file */
        if (c != '-' && (isspace (c) || ispunct (c))) { /* word-end found */
            if (!isspace (prev) && !ispunct (prev) &&   /* multiple ws/punct */
                !(prev == 's' && nc == 1)) {            /* exclude "'s" */
                buf[nc] = 0;                            /* nul-terminate */
                words = addword (words, buf, &wc, &maxw);   /* add word */
                nc = 0;     /* reset char count */
            }
        }
        else if (nc < MAXC - 1) {   /* add char to buf */
            buf[nc++] = c;
        }
        else {  /* chars exceed MAXC - 1; storage capability of struct */
            fprintf (stderr, "error: characters exceed %d.\n", MAXC);
            return 1;
        }
        prev = c;   /* save previous char */
    }
    if (!isspace (prev) && !ispunct (prev))     /* handle non-POSIX end */
        words = addword (words, buf, &wc, &maxw);

    if (fp != stdin) fclose (fp);   /* close file if not stdin */

    qsort (words, wc, sizeof *words, cmpinst);  /* sort words by frequency */

    printf ("'%s' contained '%zu' words.\n\n",  /* output total No. words */
            fp == stdin ? "stdin" : argv[1], wc);

    /* output top words (or all words in descending order if top not given) */
    for (size_t i = 0; i < (top != 0 ? top : wc); i++) {
        printf ("  %-28s    %5zu\n", words[i].word, words[i].ninst);
        total += words[i].ninst;
    }
    printf ("%33s------\n%34s%5d\n", " ", "Total: ", total);

    free (words);

    return 0;
}

/** add word to words, updating pointer to word-count 'wc' and
 *  the maximum words allocated 'maxw' as needed. returns pointer
 *  to words (which must be assigned back in the caller).
 */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw)
{
    size_t i;

    for (i = 0; i < *wc; i++)
        if (strcmp (words[i].word, word) == 0) {
            words[i].ninst++;
            return words;
        }

    if (*wc == *maxw)
        words = xrealloc (words, sizeof *words, maxw);

    strcpy (words[*wc].word, word);
    words[(*wc)++].ninst++;

    return words;
}

/** realloc 'ptr' of 'nelem' of 'psz' to 'nelem * 2' of 'psz'.
 *  returns pointer to reallocated block of memory with new
 *  memory initialized to 0/NULL. return must be assigned to
 *  original pointer in caller.
 */
void *xrealloc (void *ptr, size_t psz, size_t *nelem)
{   void *memptr = realloc ((char *)ptr, *nelem * 2 * psz);
    if (!memptr) {
        perror ("realloc(): virtual memory exhausted.");
        exit (EXIT_FAILURE);
    }   /* zero new memory (optional) */
    memset ((char *)memptr + *nelem * psz, 0, *nelem * psz);
    *nelem *= 2;
    return memptr;
}

(注意:输出按出现次数降序排序，如果单词出现次数相同，则按字母顺序排序)

示例使用/输出

$ ./bin/getchar_wordcnt_top dat/damages.txt 10
'dat/damages.txt' contained '109' words.

  the                                12
  a                                  10
  in                                  7
  of                                  7
  and                                 5
  anguish                             4
  injury                              4
  jury                                4
  mental                              4
  that                                4
                                 ------
                           Total:    61

注意:要使用哈希表作为存储基础，您至少必须创建一个指向哈希表中每个单词的指针数组，然后对指针数组进行排序。否则，您将需要复制存储并将单词复制到新数组中进行排序。 (这在某种程度上是一种内存效率低下的方法)。创建一个单独的指向哈希表中要排序的每个单词的指针数组是调用 qsort 并避免链式存储桶问题的唯一方法。

关于c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/50070148/

c - 打印给定文本文件中最常出现的单词，无法在 C 中按频率排序

上一篇：c - 如何获取SSDT地址

下一篇：c - 如何在linux中引发因EINTR而失败的semop调用？