c++ - 如何使用c函数计算文本文件中的单词数

标签 c++ c

我一直在寻找如何读取文件并统计句子中单词数量的示例。按我的理解,应该去统计有多少个空白字符,这是正确的思路吗?我确实还有其他问题,但我想一个一个解决。如果可能,我不想把这些逻辑写在 main 函数里,而是想自己写一个函数,它与我已经写好的那些函数类似,只是用来统计单词。

#include<iostream>
#include<cctype>
#include<cstring>
#include<stdio.h>
#include<fstream>
#include<stdlib.h>
#include<iomanip>
#include<string.h>

using namespace std;

// Per-line character tallies. Each field is overwritten by the counting
// function of the same name and read back by printRprt().
// Every field starts at zero so a Counts object is safe to read even before
// any counting function has been run on it.
struct Counts
{
  int countChars = 0;        // all characters before the terminating '\0'
  int countNW = 0;           // characters for which isspace() is false
  int countAlpha = 0;        // characters for which isalpha() is true
  int countDigits = 0;       // characters for which isdigit() is true
  int countPunctuation = 0;  // characters for which ispunct() is true
};

//function names
int countChars(Counts&, char[]);
int countNW(Counts&, char[]);
int countAlpha(Counts&, char[]);
int countDigits(Counts&, char[]);
int countPunctuation(Counts&, char[]);
void printRprt(Counts&);  

/*
 * Function name: main
 * Output: 0 on success, EXIT_FAILURE if the input file cannot be opened.
 * Description: Reads "inp1.txt" piece by piece with fgets, echoes each piece,
 *              runs every counting function on it and prints the report.
 */
int main()
{
  // open input file (C-style)
  FILE *fp = fopen("inp1.txt", "r");
  if (fp == NULL)
  {
    // report on stderr and signal failure to the caller (exit(0) meant success)
    fprintf(stderr, "Could not open input file\n");
    return EXIT_FAILURE;
  }

  Counts counts;
  char line[80];

  // fgets stores at most sizeof(line) - 1 characters per call, so a physical
  // line longer than that is processed in several pieces
  while (fgets(line, sizeof(line), fp))
  {
    // Never use file contents as the format string: a '%' in the input would
    // be treated as a conversion specification.
    fputs(line, stdout);
    countChars(counts, line);
    countNW(counts, line);
    countAlpha(counts, line);
    countDigits(counts, line);
    countPunctuation(counts, line);
    printRprt(counts);
  }

  fclose(fp);
  return 0;
}

/*
 * Function name: countChars
 * Output: The number of characters counted; the same value is stored in
 *         counts.countChars.
 * Description: Counts every character in line up to, but not including, the
 *              terminating '\0'. A '\n' in the buffer is counted like any
 *              other character.
*/
int countChars(Counts& counts, char line[])
{
  // strlen performs exactly the scan-until-'\0' the hand-written loop did
  counts.countChars = static_cast<int>(strlen(line));
  // the function is declared int, so it must return a value
  return counts.countChars;
}
/*
 * Function name: countNW
 * Output: The number of non-white-space characters; the same value is stored
 *         in counts.countNW.
 * Description: Counts the characters in line (up to the terminating '\0')
 *              for which isspace() is false.
*/
int countNW(Counts& counts, char line[])
{
  counts.countNW = 0;
  for (int i = 0; line[i] != '\0'; ++i)
  {
    // <cctype> functions need a value representable as unsigned char;
    // a negative plain char (non-ASCII byte) would be undefined behaviour
    if (!isspace(static_cast<unsigned char>(line[i])))
    {
      counts.countNW++;
    }
  }
  // the function is declared int, so it must return a value
  return counts.countNW;
}

/*
 * Function name: countAlpha
 * Output: The number of alphabetic characters; the same value is stored in
 *         counts.countAlpha.
 * Description: Counts the characters in line (up to the terminating '\0')
 *              for which isalpha() is true.
*/
int countAlpha(Counts& counts, char line[])
{
  counts.countAlpha = 0;
  for (int i = 0; line[i] != '\0'; ++i)
  {
    // <cctype> functions need a value representable as unsigned char;
    // a negative plain char (non-ASCII byte) would be undefined behaviour
    if (isalpha(static_cast<unsigned char>(line[i])))
    {
      counts.countAlpha++;
    }
  }
  // the function is declared int, so it must return a value
  return counts.countAlpha;
}

/*
 * Function name: countDigits
 * Output: The number of digit characters; the same value is stored in
 *         counts.countDigits.
 * Description: Counts the characters in line (up to the terminating '\0')
 *              for which isdigit() is true.
*/
int countDigits(Counts& counts, char line[])
{
  counts.countDigits = 0;
  for (int i = 0; line[i] != '\0'; ++i)
  {
    // <cctype> functions need a value representable as unsigned char;
    // a negative plain char (non-ASCII byte) would be undefined behaviour
    if (isdigit(static_cast<unsigned char>(line[i])))
    {
      counts.countDigits++;
    }
  }
  // the function is declared int, so it must return a value
  return counts.countDigits;
}

/*
 * Function name: countPunctuation
 * Output: The number of punctuation characters; the same value is stored in
 *         counts.countPunctuation.
 * Description: Counts the characters in line (up to the terminating '\0')
 *              for which ispunct() is true.
*/
int countPunctuation(Counts& counts, char line[])
{
  counts.countPunctuation = 0;
  for (int i = 0; line[i] != '\0'; ++i)
  {
    // <cctype> functions need a value representable as unsigned char;
    // a negative plain char (non-ASCII byte) would be undefined behaviour
    if (ispunct(static_cast<unsigned char>(line[i])))
    {
      counts.countPunctuation++;
    }
  }
  // the function is declared int, so it must return a value
  return counts.countPunctuation;
}

/*
 * Function name: printRprt
 * Output: Nothing is returned; the report is written to std::cout.
 * Description: Writes the five tallies held in counts, one labelled value per
 *              line, then ends the last line with endl (which also flushes).
*/
void printRprt(Counts& counts)
{
  std::cout << "Total characters: " << counts.countChars;
  std::cout << "\nNon-white space: " << counts.countNW;
  std::cout << "\nAlphabetic: " << counts.countAlpha;
  std::cout << "\nDigits: " << counts.countDigits;
  std::cout << "\nPunctuation: " << counts.countPunctuation;
  std::cout << std::endl;
}

根据我的理解,我似乎可以借用统计非空白字符的那个函数,但我想知道应该修改哪些地方,才能让它按单词而不是按单个字符来统计。如果您有不明白的地方,请在投反对票之前告诉我。谢谢!

最佳答案

抱歉,我第一次发帖时没有解释,我被叫走了。

如果使用 C++,我会使用标准库中的正则表达式类。下面的代码是一个快速拼凑的实现,它统计整个文件中感兴趣的各项数量,并按文件中的每个句子分别统计;这里的“句子”是指以一个或多个标点符号“.”、“!”或“?”结尾、后面跟着若干空白字符的内容。

这样做的优点是相当灵活并且需要维护的代码很少。

代码可以改进。main() 应该检查命令行参数,而不是盲目地假设有一个。如果无法打开输入文件流(ifstream),则应该报错。并且可能应该有一个使用说明。按句子统计的代码也可以改进,例如在调用 Countem() 之前使用 regex_replace() 删除句子中的换行符。这些都是简单的添加。

花精力学习标准库中的正则表达式工具,很快就会在文本处理任务中得到回报。它们非常强大。

#include <algorithm>
#include <fstream>
#include <iostream>
#include <regex>
#include <string>

using namespace std;

// Returns the number of non-overlapping matches of the regular expression
// `re` found in `s`, scanning from the start of the string to its end.
size_t Countem(const string & s, const string & re)
{
  const regex pattern(re);
  const sregex_iterator last;  // default-constructed iterator marks the end
  size_t hits = 0;
  for (sregex_iterator it(s.begin(), s.end(), pattern); it != last; ++it) {
    ++hits;
  }
  return hits;
}

// Prints a small report for every sentence found in `s`.
// A sentence is a run of characters other than '!', '.' and '?', followed by
// one or more of those three marks and then one or more spaces, tabs or
// newlines. Text that is not followed by such whitespace is not matched.
void PerSentence(const string & s)
{
  const regex sentence("[^\\!\\.\\?]+[\\!\\.\\?]+[ \t\n]+");
  const sregex_iterator last;  // default-constructed iterator marks the end
  for (sregex_iterator it(s.begin(), s.end(), sentence); it != last; ++it) {
    // extract the matched text once and reuse it for every count
    const string text = it->str();
    cout << "Sentence: " << text << '\n';
    cout << "    Words: " << Countem(text, "[^ \t\n]+") << '\n';
    cout << "    Alphas: " << Countem(text, "[[:alpha:]]") << '\n';
    cout << "    Digits: " << Countem(text, "[[:digit:]]") << '\n';
    cout << "    Punctuation: " << Countem(text, "[[:punct:]]") << '\n';
  }
}

// Entry point: expects the path of a text file as the first command-line
// argument, prints whole-file counts, then a per-sentence breakdown.
// Returns 0 on success and 1 when the argument is missing or the file cannot
// be opened.
int main(int argc, char *argv[])
{
  if (argc < 2) {
    // argv[0] may be absent when argc is 0, so fall back to a placeholder
    cerr << "Usage: " << (argc > 0 ? argv[0] : "program") << " <input-file>\n";
    return 1;
  }

  ifstream  is(argv[1]);
  if (!is) {
    cerr << "Could not open input file: " << argv[1] << '\n';
    return 1;
  }

  // read the whole file into memory; the extra parentheses around the first
  // argument keep this from being parsed as a function declaration
  string    s((istreambuf_iterator<char>(is)),
              istreambuf_iterator<char>());
  cout << "File:\n"
       << "    Words: " << Countem(s, "[^ \t\n]+") << '\n'
       << "    Alphas: " << Countem(s, "[[:alpha:]]") << '\n'
       << "    Digits: " << Countem(s, "[[:digit:]]") << '\n'
       << "    Punctuation: " << Countem(s, "[[:punct:]]") << '\n'
       << "    Sentences: " << Countem(s, "[\\!\\.\\?][ \t\n]+") << "\n\n";

  PerSentence(s);
  return 0;
}

关于c++ - 如何使用c函数计算文本文件中的单词数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36758481/

相关文章:

c++ - XCB STRING 和 WM_NAME 未定义

c - 为什么 scanf() 在某些情况下需要 & operator (address-of) 而在其他情况下不需要?

c - 在 C 中使用 strstr 方法与两个数组?

c - 为什么使用数据而不是 xdata 会显着减少代码空间

c++ - 串行端口 : ReadFile and CloseHandle

c++ - 如何将我的窗口投影为前台的事件窗口

c++ - AES 加密/解密文本

c++ - 缺少 cudart64_80.dll

c - 这个C代码有什么问题?

c++ - Boost.Locale 是否支持读写 UTF-8 编码的文件?