Reputation: 1217

Mutual words in files using hadoop mapreduce

I have been trying to execute some code that would allow me to 'only' list the words that exist in multiple files; what I have done so far was use the wordcount example and thanx to Chris White I managed to compile it. I tried reading here and there to get the code to work but all I am getting is a blank page with no data. the mapper is suppose to collect each word with its corresponding locations; the reducer is suppose to collect the common words any thoughts as to what might be the problem? the code is:

    package org.myorg;

import java.io.IOException;
import java.util.*;
import java.lang.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordCount {



    public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text> 
    {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

          private Text outvalue=new Text();
          private String filename = null;

        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException 
        {
        if (filename == null) 
        {
          filename = ((FileSplit) reporter.getInputSplit()).getPath().getName();
        }

        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);

        while (tokenizer.hasMoreTokens()) 
        {
          word.set(tokenizer.nextToken());
          outvalue.set(filename);
          output.collect(word, outvalue);
        }

        }
    }



    public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> 
    {


        private Text src = new Text();
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException 
        {


        int sum = 0;
        //List<Text> list = new ArrayList<Text>(); 

            while (values.hasNext()) // I believe this would have all locations of the same word in different files?
            {

                sum += values.next().get();
                src =values.next().get();

            }
        output.collect(key, src);
            //while(values.hasNext()) 
            //{ 
                //Text value = values.next(); 
                //list.add(new Text(value)); 
                //System.out.println(value.toString());       
            //} 
            //System.out.println(values.toString()); 
            //for(Text value : list) 
            //{ 
                //System.out.println(value.toString()); 
            //} 


        }

    }



    public static void main(String[] args) throws Exception 
    {

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);

    }

}

Am I missing anything? much obliged... My Hadoop version : 0.20.203

Upvotes: 0

Answers (2)

Chris White

Reputation: 30089

In the reducer, maintain a set of the values observed (the filenames emitted in the mapper), if after you consume all the values, this set size is 1, then the word is only used in one file.

public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> 
{
    private TreeSet<Text> files = new TreeSet<Text>();

    public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException 
    {
        files.clear();

        for (Text file : values)
        {
            if (!files.contains(value))
            {
                // make a copy of value as hadoop re-uses the object
                files.add(new Text(value));
            }
        }

        if (files.size() == 1) {
            output.collect(key, files.first());
        }

        files.clear();
    }
}

Upvotes: 1

Charles Menguy

Reputation: 41428

First of all it seems you're using the old Hadoop API (mapred), and a word of advice would be to use the new Hadoop API (mapreduce) which is compatible with 0.20.203

In the new API, here is a wordcount that will work

import java.io.IOException;
import java.lang.InterruptedException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
/**
 * The map class of WordCount.
 */
public static class TokenCounterMapper
    extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
/**
 * The reducer class of WordCount
 */
public static class TokenCounterReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
/**
 * The main entry point.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = new Job(conf, "Example Hadoop 0.20.1 WordCount");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenCounterMapper.class);
    job.setReducerClass(TokenCounterReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Then, we build this file and pack the result into a jar file:

mkdir classes
javac -classpath /path/to/hadoop-0.20.203/hadoop-0.20.203-core.jar:/path/to/hadoop-  0.20.203/lib/commons-cli-1.2.jar -d classes WordCount.java && jar -cvf wordcount.jar -C classes/ .

Finally, we run the jar file in standalone mode of Hadoop

echo "hello world bye world" > /tmp/in/0.txt
echo "hello hadoop goodebye hadoop" > /tmp/in/1.txt
hadoop jar wordcount.jar org.packagename.WordCount /tmp/in /tmp/out

Upvotes: 1

Mutual words in files using hadoop mapreduce

Answers (2)

Related Questions