Reputation: 9202
I am trying to learn hadoop.
I have the following file downloaded from free large data set websites. I made it short for my sample testing. This is the small file.
"CAMIS","DBA","BORO","BUILDING","STREET","ZIPCODE","PHONE","CUISINECODE","INSPDATE","ACTION","VIOLCODE","SCORE","CURRENTGRADE","GRADEDATE","RECORDDATE"
"40280083","INTERCONTINENTAL THE BARCLAY","1","111 ","EAST 48 STREET ","10017","2129063134","03","2014-02-07 00:00:00","D","10F","4","A","2014-02-07 00:00:00","2014-04-24 06:01:04.920000000"
"40356649","REGINA CATERERS","3","6409","11 AVENUE","11219","7182560829","03","2013-07-30 00:00:00","D","08A","12","A","2013-07-30 00:00:00","2014-04-24 06:01:04.920000000"
"40356649","REGINA CATERERS","3","6409","11 AVENUE","11219","7182560829","03","2013-07-30 00:00:00","D","08B","12","A","2013-07-30 00:00:00","2014-04-24 06:01:04.920000000"
"40356731","TASTE THE TROPICS ICE CREAM","3","1839 ","NOSTRAND AVENUE ","11226","7188560821","43","2013-07-10 00:00:00","D","06C","8","A","2013-07-10 00:00:00","2014-04-24 06:01:04.920000000"
"40356731","TASTE THE TROPICS ICE CREAM","3","1839 ","NOSTRAND AVENUE ","11226","7188560821","43","2013-07-10 00:00:00","D","10B","8","A","2013-07-10 00:00:00","2014-04-24 06:01:04.920000000"
"40357217","WILD ASIA","2","2300","SOUTHERN BOULEVARD","10460","7182207846","03","2013-06-19 00:00:00","D","10B","4","A","2013-06-19 00:00:00","2014-04-24 06:01:04.920000000"
"40360045","SEUDA FOODS","3","705 ","KINGS HIGHWAY ","11223","7183751500","50","2013-10-10 00:00:00","D","08C","13","A","2013-10-10 00:00:00","2014-04-24 06:01:04.920000000"
"40361521","GLORIOUS FOOD","1","522","EAST 74 STREET","10021","2127372140","03","2013-12-19 00:00:00","U","08A","16","B","2013-12-19 00:00:00","2014-04-24 06:01:04.920000000"
"40362098","HARRIET'S KITCHEN","1","502","AMSTERDAM AVENUE","10024","2127210045","18","2014-03-04 00:00:00","U","10F","13","A","2014-03-04 00:00:00","2014-04-24 06:01:04.920000000"
"40361322","CARVEL ICE CREAM","4","265-15 ","HILLSIDE AVENUE ","11004","7183430392","43","2013-09-18 00:00:00","D","08A","10","A","2013-09-18 00:00:00","2014-04-24 06:01:04.920000000"
"40361708","BULLY'S DELI","1","759 ","BROADWAY ","10003","2122549755","27","2014-01-21 00:00:00","D","10F","12","A","2014-01-21 00:00:00","2014-04-24 06:01:04.920000000"
"40362098","HARRIET'S KITCHEN","1","502","AMSTERDAM AVENUE","10024","2127210045","18","2014-03-04 00:00:00","U","04N","13","A","2014-03-04 00:00:00","2014-04-24 06:01:04.920000000"
"40362274","ANGELIKA FILM CENTER","1","18","WEST HOUSTON STREET","10012","2129952570","03","2014-04-03 00:00:00","D","06D","9","A","2014-04-03 00:00:00","2014-04-24 06:01:04.920000000"
"40362715","THE COUNTRY CAFE","1","60","WALL STREET","10005","3474279132","83","2013-09-18 00:00:00","D","10B","13","A","2013-09-18 00:00:00","2014-04-24 06:01:04.920000000"
"40362869","SHASHEMENE INT'L RESTAURA","3","195","EAST 56 STREET","11203","3474300871","17","2013-05-08 00:00:00","D","10B","7","A","2013-05-08 00:00:00","2014-04-24 06:01:04.920000000"
"40363021","DOWNTOWN DELI","1","107","CHURCH STREET","10007","2122332911","03","2014-02-26 00:00:00","D","10B","9","A","2014-02-26 00:00:00","2014-04-24 06:01:04.920000000"
"40362432","HO MEI RESTAURANT","4","103-05","37 AVENUE","11368","7187796903","20","2014-04-21 00:00:00","D","06C","10","A","2014-04-21 00:00:00","2014-04-24 06:01:04.920000000"
"40362869","SHASHEMENE INT'L RESTAURA","3","195","EAST 56 STREET","11203","3474300871","17","2013-05-08 00:00:00","D","10F","7","A","2013-05-08 00:00:00","2014-04-24 06:01:04.920000000"
"40363117","MEJLANDER & MULGANNON","3","7615","5 AVENUE","11209","7182386666","03","2013-10-24 00:00:00","D","02G","11","A","2013-10-24 00:00:00","2014-04-24 06:01:04.920000000"
"40363289","HAPPY GARDEN","2","1236 ","238 SPOFFORD AVE ","10474","7186171818","20","2013-12-30 00:00:00","D","10F","8","A","2013-12-30 00:00:00","2014-04-24 06:01:04.920000000"
"40363644","DOMINO'S PIZZA","1","464","3 AVENUE","10016","2125450200","62","2014-03-06 00:00:00","D","08A","11","A","2014-03-06 00:00:00","2014-04-24 06:01:04.920000000"
"30191841","DJ REYNOLDS PUB AND RESTAURANT","1","351 ","WEST 57 STREET ","10019","2122452912","03","2013-07-22 00:00:00","D","10B","11","A","2013-07-22 00:00:00","2014-04-24 06:01:04.920000000"
"40280083","INTERCONTINENTAL THE BARCLAY","1","111 ","EAST 48 STREET ","10017","2129063134","03","2014-02-07 00:00:00","D","10B","4","A","2014-02-07 00:00:00","2014-04-24 06:01:04.920000000"
"40356442","KOSHER ISLAND","5","2206","VICTORY BOULEVARD","10314","7186985800","50","2013-04-04 00:00:00","D","10F","12","A","2013-04-04 00:00:00","2014-04-24 06:01:04.920000000"
"40356483","WILKEN'S FINE FOOD","3","7114 ","AVENUE U ","11234","7184443838","27","2014-01-14 00:00:00","D","10B","10","A","2014-01-14 00:00:00","2014-04-24 06:01:04.920000000"
File is about some inspection in restaurants.
You can see there is CUISINECODE. Values of it ranges from "00" to some value or can be any value. There will be many restaurants have the same CUISINECODE.
I just want to display the number of restaurants in each cusinecode.
This is my MapReducer Program
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class RestaurantInspection {
public static class Map extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
String line = value.toString();
if (line.startsWith("\"CAMIS\",")) {
// Line is the header, ignore it
return;
}
List<String> columns = new ArrayList<String>();
String[] tokens = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
if (tokens.length != 15) {
// Line isn't the correct number of columns or formatted properly
return;
}
for(String t : tokens) {
columns.add(t.replaceAll("\"", ""));
}
int cusineCode = Integer.parseInt(columns.get(7));
String violations = columns.get(9) + " --- " + columns.get(10);
value.set(violations);
output.collect(value, new IntWritable(cusineCode));
}
}
public static class Reduce extends MapReduceBase implements
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(RestaurantInspection.class);
conf.setJobName("Restaurent Inspection");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
I am using hadoop 1.2.1. I copied the above code from WordCount Example and just changed few lines.
When I run the above code in hadoop I am getting following lines for the same file I given above
D --- 02G 3
D --- 06C 63
D --- 06D 3
D --- 08A 108
D --- 08B 3
D --- 08C 50
D --- 10B 182
D --- 10F 117
U --- 04N 18
U --- 08A 3
U --- 10F 18
That was just a test. I am not getting any logic of how to write the code to get the desired output. I am expecting the following output for the above file.
01 -- 1
03 -- 9
43 -- 3
50 -- 2
18 -- 2
27 -- 2
83 -- 1
17 -- 2
20 -- 2
62 -- 1
By this, I think I can learn hadoop and map reduce.
So how to write the code? Thanks.
Upvotes: 0
Views: 377
Reputation: 373
You need key to be CUISINECODE.
String cusineCode = columns.get(7);
output.collect(new Text(cusineCode), new IntWritable(1));
This will do the job for you.
Upvotes: 1