Reputation: 1525
I have 2 input files for a Hadoop MapRed task.
The input to the program is input.txt
which contains in each row PaperID keyword1 keyword2 FieldID
p20 k j f3
p21 k j f11
p22 k j f3
p23 k j f2
p23 k j f1
The file sammap.txt
used in the Reducer class contains in each row FieldID FieldName
f1 AI
f2 DB
f3 DB
f4 AI
The code is as follows: package dorado;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Dorado {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, Text>{
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String str = value.toString().replaceAll(" +", " ").trim(), fir="", lst="";
if (!str.equals("")) {
fir = str.substring(0, str.indexOf(" "));
lst = str.substring(str.lastIndexOf(" ")+1);
context.write(new Text(fir), new Text(lst));
}
}
}
public static class IntSumReducer
extends Reducer<Text,Text,Text,Text> {
// private IntWritable result = new IntWritable();
private HashMap<Text, Text> fieldToClass = new HashMap<>();
public void reduce(Text key, Iterable <Text> value,Context context) throws IOException, InterruptedException {
FileReader fr = new FileReader("sammap.txt");
BufferedReader br = new BufferedReader(fr);
String str=null;
while ((str = br.readLine()) != null) {
String st[] = str.split(" +");
fieldToClass.put(new Text(st[0].trim()), new Text(st[1].trim()));
//System.out.println("--"+st[0].trim()+"--"+ st[1].trim()+"--");
}
fr.close();
for (Text field : value) {
System.out.println(key + "-->" + field);
if (fieldToClass.containsKey(field))
context.write(key, fieldToClass.get(field));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "dorado");
job.setJarByClass(Dorado.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The Hashmap fieldToClass
has the key as FieldID and value as FieldName.
The output on the console for the following code snippet in the Reducer Class:
for (Text field : value) {
System.out.println(key + "-->" + field);
if (fieldToClass.containsKey(field))
context.write(key, fieldToClass.get(field));
}
is this:
p20-->DB
p22-->DB
p23-->AI
p23-->DB
However we would expect the output to be of the form:
p20-->f3
p22-->f3
p23-->f1
p23-->f2
Also there is no output in the final output file of the program. The file is empty.
The correct output we expect in the file is:
p20 DB
p22 DB
p23 DB
p23 AI
Why is the program behaving erroneously? What are the possible solutions?
Upvotes: 0
Views: 54
Reputation: 91
Your whole process can be done inside the mapper itself. Use the setup function of the mapper to initialize the HashMap. Directly search for the fieldId in the HashMap, get its value, and write it to the context. Output the same thing in the reducer inside the for loop over the iterable values.
Upvotes: 1