java - Most common words in a text

Tags: java hadoop

I am trying to find the most frequently used words in a text using Hadoop. Hadoop is a framework that enables the distributed processing of large data sets across clusters of computers.

I know this can be done easily with a Unix pipeline such as sort -n -k2 txtname | tail, but that does not scale to large data sets. So I am trying to solve the problem in a distributed way and then combine the results.

Here is my WordCount class:

    import java.util.Arrays;
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {
      public static void runJob(String[] input, String output) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        Path outputPath = new Path(output);
        // accept several input paths as a comma-separated list
        FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job, outputPath);
        // remove any previous output so the job can be re-run
        outputPath.getFileSystem(conf).delete(outputPath, true);
        job.waitForCompletion(true);
      }

      public static void main(String[] args) throws Exception {
        // the last argument is the output directory, everything before it is input
        runJob(Arrays.copyOfRange(args, 0, args.length - 1), args[args.length - 1]);
      }
    }

I know I still need to do more work so that a second map/reduce job can run alongside this word-count job.

Here is my TokenizerMapper class:

    import java.io.IOException;
    import java.util.StringTokenizer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
      private final IntWritable one = new IntWritable(1);
      private Text data = new Text();

      @Override
      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // split on whitespace and punctuation; every character in the
        // delimiter string is treated as a separate delimiter
        StringTokenizer itr = new StringTokenizer(value.toString(), "-- \t\n\r\f,.:;?![]'\"");

        while (itr.hasMoreTokens()) {
          data.set(itr.nextToken().toLowerCase());
          context.write(data, one);
        }
      }
    }
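
As an aside, each character of that second StringTokenizer argument is a single-character delimiter, so the leading "--" merely lists '-' twice. A quick standalone check (the class name is just for illustration):

    import java.util.StringTokenizer;

    public class TokenizerCheck {
      public static void main(String[] args) {
        StringTokenizer itr = new StringTokenizer("Well--known words, words!", "-- \t\n\r\f,.:;?![]'\"");
        while (itr.hasMoreTokens()) {
          // prints: well, known, words, words (one token per line)
          System.out.println(itr.nextToken().toLowerCase());
        }
      }
    }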

Here is my IntSumReducer class:

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
      private IntWritable result = new IntWritable();

      @Override
      public void reduce(Text key, Iterable<IntWritable> values, Context context)
          throws IOException, InterruptedException {
        int sum = 0;

        for (IntWritable value : values) {
          sum += value.get();
        }

        result.set(sum);

        // only emit words that occur more than three times
        if (sum > 3) {
          context.write(key, result);
        }
      }
    }

What I need to do is define a second map and reduce class that works alongside the current one and emits the word with the most occurrences. Here is my reducer class so far:

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class reducer2 extends Reducer<Text, IntWritable, Text, IntWritable> {
      private int max_sum = 0;
      private Text max_occured_key = new Text();

      @Override
      public void reduce(Text key, Iterable<IntWritable> values, Context context)
          throws IOException, InterruptedException {
        int sum = 0;

        for (IntWritable value : values) {
          sum += value.get();
        }

        // remember the word with the highest count seen so far; the
        // winner is written once, in cleanup(), after the last group,
        // rather than on every reduce() call
        if (sum > max_sum) {
          max_sum = sum;
          max_occured_key.set(key);
        }
      }

      @Override
      protected void cleanup(Context context) throws IOException, InterruptedException {
        context.write(max_occured_key, new IntWritable(max_sum));
      }
    }
The code for mapper2:

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class mapper2 extends Mapper<Object, Text, Text, IntWritable> {
      private final IntWritable one = new IntWritable(1);
      private Text data = new Text();

      @Override
      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString(), "-- \t\n\r\f,.:;?![]'\"");

        while (itr.hasMoreTokens()) {
          // nextToken() must be consumed every iteration, or the loop never terminates
          data.set(itr.nextToken().toLowerCase());
          context.write(data, one);
        }
      }
    }
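
Note that, once the loop consumes nextToken(), mapper2 does exactly what TokenizerMapper already does, which is presumably why the edited driver below reuses TokenizerMapper for the second job.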

I also edited the WordCount class so that it runs both jobs at once:

    import java.util.Arrays;
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {
      public static void runJob(String[] input, String output) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        Path outputPath = new Path(output);
        FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job, outputPath);
        outputPath.getFileSystem(conf).delete(outputPath, true);
        job.waitForCompletion(true);

        Job job2 = new Job(conf);
        job2.setJarByClass(WordCount.class);

        // mapper2 would work just as well here, since it does the same thing
        job2.setMapperClass(TokenizerMapper.class);
        job2.setReducerClass(reducer2.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(IntWritable.class);

        // the second job must be configured on job2 (not job) and needs its
        // own output directory, or it deletes the first job's results
        Path outputPath2 = new Path(output + "2");
        FileInputFormat.setInputPaths(job2, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job2, outputPath2);
        outputPath2.getFileSystem(conf).delete(outputPath2, true);
        job2.waitForCompletion(true);
      }

      public static void main(String[] args) throws Exception {
        runJob(Arrays.copyOfRange(args, 0, args.length - 1), args[args.length - 1]);
      }
    }

How can I find the most common words in a text using Hadoop?

Best Answer

This is the canonical word-count problem, and you can find any number of solutions to the basic word count with a quick search. Then you just need one extra step: return the word with the maximum count.

How to do that?

If the volume of data is small enough that you can afford a single reducer, set the number of reducers to 1. In the reduce, keep a local variable that remembers which group (i.e. word) has the highest count, then write that result to a file in HDFS.
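
A minimal sketch of that configuration, assuming it sits inside the question's runJob after the first job completes and reuses the question's own classes; the setNumReduceTasks(1) call is the one line the posted driver is missing:

    Job job2 = new Job(conf);
    job2.setJarByClass(WordCount.class);
    job2.setMapperClass(TokenizerMapper.class);
    job2.setReducerClass(reducer2.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(IntWritable.class);

    // the crucial line: with exactly one reduce task, a single reducer2
    // instance sees every (word, counts) group, so its cleanup() emits
    // the global maximum
    job2.setNumReduceTasks(1);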

If the volume of data rules out a single reducer, then you need one step beyond the first one mentioned above: you need to find the maximum across all the reducers. You can do that with global counters, or by writing each reducer's maximum word into its own (tiny) file in HDFS and running a post-processing step (perhaps a Linux script) that parses those files and picks the overall maximum. Alternatively, you could use another map/reduce job to find it, but that is a bit of overkill for such a small/simple operation.
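
Here is a minimal post-processing sketch in Java rather than a shell script, assuming each reducer wrote its single local maximum as the usual word<TAB>count line into part-r-NNNNN files; the class name and the output-directory argument are just for illustration:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class MaxOfMaxima {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        String maxWord = null;
        int maxCount = -1;

        // scan every reducer output file (part-r-00000, part-r-00001, ...)
        FileStatus[] parts = fs.globStatus(new Path(args[0], "part-r-*"));
        for (FileStatus status : parts) {
          BufferedReader reader =
              new BufferedReader(new InputStreamReader(fs.open(status.getPath())));
          String line;
          while ((line = reader.readLine()) != null) {
            // TextOutputFormat writes key<TAB>value
            String[] fields = line.split("\t");
            int count = Integer.parseInt(fields[1]);
            if (count > maxCount) {
              maxCount = count;
              maxWord = fields[0];
            }
          }
          reader.close();
        }

        System.out.println(maxWord + "\t" + maxCount);
      }
    }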

Regarding java - most common words in a text, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/21332787/
