java - Most common words in a text

Tags: java hadoop

I am trying to find the most frequently used words in a text using Hadoop. Hadoop is a framework that enables the distributed processing of large data sets across clusters of computers.

I know this can be done easily with a Unix pipeline such as sort -n -k2 txtname | tail, but that does not scale to large data sets. So I am trying to solve the problem in a distributed way and then combine the results.

Here is my WordCount class:

    import java.util.Arrays;
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {
      public static void runJob(String[] input, String output) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        Path outputPath = new Path(output);
        // accept several input paths as a comma-separated list
        FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job, outputPath);
        // remove any previous output so the job can be re-run
        outputPath.getFileSystem(conf).delete(outputPath, true);
        job.waitForCompletion(true);
      }

      public static void main(String[] args) throws Exception {
        // the last argument is the output directory, everything before it is input
        runJob(Arrays.copyOfRange(args, 0, args.length - 1), args[args.length - 1]);
      }
    }

I know I still need to do more work so that a second map/reduce job can run alongside this word-count job.

Here is my TokenizerMapper class:

    import java.io.IOException;
    import java.util.StringTokenizer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
      private final IntWritable one = new IntWritable(1);
      private Text data = new Text();

      @Override
      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // split on whitespace and punctuation; every character in the
        // delimiter string is treated as a separate delimiter
        StringTokenizer itr = new StringTokenizer(value.toString(), "-- \t\n\r\f,.:;?![]'\"");

        while (itr.hasMoreTokens()) {
          data.set(itr.nextToken().toLowerCase());
          context.write(data, one);
        }
      }
    }
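
As an aside, each character of that second StringTokenizer argument is a single-character delimiter, so the leading "--" merely lists '-' twice. A quick standalone check (the class name is just for illustration):

    import java.util.StringTokenizer;

    public class TokenizerCheck {
      public static void main(String[] args) {
        StringTokenizer itr = new StringTokenizer("Well--known words, words!", "-- \t\n\r\f,.:;?![]'\"");
        while (itr.hasMoreTokens()) {
          // prints: well, known, words, words (one token per line)
          System.out.println(itr.nextToken().toLowerCase());
        }
      }
    }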

Here is my IntSumReducer class:

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
      private IntWritable result = new IntWritable();

      @Override
      public void reduce(Text key, Iterable<IntWritable> values, Context context)
          throws IOException, InterruptedException {
        int sum = 0;

        for (IntWritable value : values) {
          sum += value.get();
        }

        result.set(sum);

        // only emit words that occur more than three times
        if (sum > 3) {
          context.write(key, result);
        }
      }
    }

What I need to do is define a second map and reduce class that works alongside the current one and emits the word with the most occurrences. Here is my reducer class so far:

    import java.io.IOException;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class reducer2 extends Reducer<Text, IntWritable, Text, IntWritable> {
      private int max_sum = 0;
      private Text max_occured_key = new Text();

      @Override
      public void reduce(Text key, Iterable<IntWritable> values, Context context)
          throws IOException, InterruptedException {
        int sum = 0;

        for (IntWritable value : values) {
          sum += value.get();
        }

        // remember the word with the highest count seen so far; the
        // winner is written once, in cleanup(), after the last group,
        // rather than on every reduce() call
        if (sum > max_sum) {
          max_sum = sum;
          max_occured_key.set(key);
        }
      }

      @Override
      protected void cleanup(Context context) throws IOException, InterruptedException {
        context.write(max_occured_key, new IntWritable(max_sum));
      }
    }
The code for mapper2:

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class mapper2 extends Mapper<Object, Text, Text, IntWritable> {
      private final IntWritable one = new IntWritable(1);
      private Text data = new Text();

      @Override
      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString(), "-- \t\n\r\f,.:;?![]'\"");

        while (itr.hasMoreTokens()) {
          // nextToken() must be consumed every iteration, or the loop never terminates
          data.set(itr.nextToken().toLowerCase());
          context.write(data, one);
        }
      }
    }
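
Note that, once the loop consumes nextToken(), mapper2 does exactly what TokenizerMapper already does, which is presumably why the edited driver below reuses TokenizerMapper for the second job.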

I also edited the WordCount class so that it runs both jobs at once:

    import java.util.Arrays;
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {
      public static void runJob(String[] input, String output) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        Path outputPath = new Path(output);
        FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job, outputPath);
        outputPath.getFileSystem(conf).delete(outputPath, true);
        job.waitForCompletion(true);

        Job job2 = new Job(conf);
        job2.setJarByClass(WordCount.class);

        // mapper2 would work just as well here, since it does the same thing
        job2.setMapperClass(TokenizerMapper.class);
        job2.setReducerClass(reducer2.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(IntWritable.class);

        // the second job must be configured on job2 (not job) and needs its
        // own output directory, or it deletes the first job's results
        Path outputPath2 = new Path(output + "2");
        FileInputFormat.setInputPaths(job2, StringUtils.join(input, ","));
        FileOutputFormat.setOutputPath(job2, outputPath2);
        outputPath2.getFileSystem(conf).delete(outputPath2, true);
        job2.waitForCompletion(true);
      }

      public static void main(String[] args) throws Exception {
        runJob(Arrays.copyOfRange(args, 0, args.length - 1), args[args.length - 1]);
      }
    }

How can I find the most common words in a text using Hadoop?

Best Answer

This is the canonical word-count problem, and you can find any number of solutions to the basic word count with a quick search. Then you just need one extra step: return the word with the maximum count.

How to do that?

If the volume of data is small enough that you can afford a single reducer, set the number of reducers to 1. In the reduce, keep a local variable that remembers which group (i.e. word) has the highest count, then write that result to a file in HDFS.
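
A minimal sketch of that configuration, assuming it sits inside the question's runJob after the first job completes and reuses the question's own classes; the setNumReduceTasks(1) call is the one line the posted driver is missing:

    Job job2 = new Job(conf);
    job2.setJarByClass(WordCount.class);
    job2.setMapperClass(TokenizerMapper.class);
    job2.setReducerClass(reducer2.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(IntWritable.class);

    // the crucial line: with exactly one reduce task, a single reducer2
    // instance sees every (word, counts) group, so its cleanup() emits
    // the global maximum
    job2.setNumReduceTasks(1);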

If the volume of data rules out a single reducer, then you need one step beyond the first one mentioned above: you need to find the maximum across all the reducers. You can do that with global counters, or by writing each reducer's maximum word into its own (tiny) file in HDFS and running a post-processing step (perhaps a Linux script) that parses those files and picks the overall maximum. Alternatively, you could use another map/reduce job to find it, but that is a bit of overkill for such a small/simple operation.
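
Here is a minimal post-processing sketch in Java rather than a shell script, assuming each reducer wrote its single local maximum as the usual word<TAB>count line into part-r-NNNNN files; the class name and the output-directory argument are just for illustration:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class MaxOfMaxima {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        String maxWord = null;
        int maxCount = -1;

        // scan every reducer output file (part-r-00000, part-r-00001, ...)
        FileStatus[] parts = fs.globStatus(new Path(args[0], "part-r-*"));
        for (FileStatus status : parts) {
          BufferedReader reader =
              new BufferedReader(new InputStreamReader(fs.open(status.getPath())));
          String line;
          while ((line = reader.readLine()) != null) {
            // TextOutputFormat writes key<TAB>value
            String[] fields = line.split("\t");
            int count = Integer.parseInt(fields[1]);
            if (count > maxCount) {
              maxCount = count;
              maxWord = fields[0];
            }
          }
          reader.close();
        }

        System.out.println(maxWord + "\t" + maxCount);
      }
    }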

Regarding java - most common words in a text, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/21332787/
