Reputation: 39
I am a beginner to Hadoop and want to understand the flow of a MapReduce job. I am a bit confused about how to give input to the map job through code rather than through a file. How should I configure this? Kindly assist me. Here is my code.
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using Microsoft.Hadoop.MapReduce;

public class TemperatureMapper : MapperBase
{
    private const int MISSING = 9999;

    public override void Map(string line, MapperContext context)
    {
        // Parse the year, air temperature, and quality code out of the
        // fixed-width weather record
        string year = line.Substring(15, 4);
        int startIndex = line[87] == '+' ? 88 : 87;
        int airTemp = int.Parse(line.Substring(startIndex, 92 - startIndex));
        string quality = line.Substring(92, 1);

        // Emit only valid readings: the quality code must match [01459]
        if (airTemp != MISSING && Regex.IsMatch(quality, "[01459]"))
        {
            context.EmitKeyValue(year, airTemp.ToString());
        }
    }
}
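// A hypothetical record illustrating the offsets Map assumes: year at columns
// 15-18, sign at 87, temperature digits at 88-91, quality code at 92. For
// example, the constructed line
//     new string('0', 15) + "1950" + new string('0', 68) + "+0022" + "1"
// would make Map emit the pair ("1950", "22").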
//Reducer
public class TempReducer : ReducerCombinerBase
{
    // For each year key, find the maximum temperature among its values
    public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
    {
        int maxValue = int.MinValue;
        foreach (string value in values)
        {
            maxValue = Math.Max(maxValue, int.Parse(value));
        }
        context.EmitKeyValue(key, maxValue.ToString());
    }
}
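// Note: since taking a maximum is associative and commutative, this class can
// also serve as a map-side combiner (hence ReducerCombinerBase), which would
// cut down the volume of data shuffled to the reducers.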
static void Main(string[] args)
{
    try
    {
        // Read the sample input into an in-memory string array
        string line;
        StreamReader file = new StreamReader("temp.txt");
        ArrayList al = new ArrayList();
        while ((line = file.ReadLine()) != null)
        {
            al.Add(line);
        }
        file.Close();
        string[] input = (string[])al.ToArray(typeof(string));

        Environment.SetEnvironmentVariable("HADOOP_HOME", @"c:\hadoop");
        Environment.SetEnvironmentVariable("JAVA_HOME", @"c:\hadoop\jvm");

        // Local, in-process run: StreamingUnit takes the input array directly
        var output = StreamingUnit.Execute<TemperatureMapper, TempReducer>(input); // this executes successfully

        // Running the job in Azure
        var hadoop = Hadoop.Connect(); // connects to Hadoop successfully
        var config = new HadoopJobConfiguration();
        hadoop.MapReduceJob.Execute<TemperatureMapper, TempReducer>(config); // how do I provide input here?

        Console.ReadLine();
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
I get the correct result through StreamingUnit. Now I want to execute this job on Azure, so how do I give the input through code and not through files? I have given the input through the config, i.e.
config.AdditionalStreamingArguments.AddRange(input); // input is an array of strings
but when I execute the job, this exception occurs:
The argument must not be empty string.
Parameter name: blobName
Upvotes: 2
Views: 373
Reputation: 4455
Hadoop MapReduce will only operate on files by default (you could write a storage handler that is not file based, but that's not very common).
If you are trying to apply MapReduce to something generated as a stream, rather than something that exists as files on HDFS, you might want to look into something like Storm on YARN instead.
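The "blobName" exception points the same way: on Azure the job executor expects input and output paths in blob storage, and AdditionalStreamingArguments are, as the name suggests, extra arguments for the streaming command line, not job input. A minimal sketch of the file-based route, assuming the .NET SDK's hadoop.StorageSystem file helpers and made-up paths (verify both against your SDK version): stage the in-memory lines into storage first, then point the job at that path.
var hadoop = Hadoop.Connect();

// Stage the in-memory lines as a file the job can read (path is illustrative)
hadoop.StorageSystem.WriteAllLines("input/temp/data.txt", input);

var config = new HadoopJobConfiguration();
config.InputPath = "input/temp";     // directory the job reads from
config.OutputFolder = "output/temp"; // directory the results are written to

hadoop.MapReduceJob.Execute<TemperatureMapper, TempReducer>(config);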
Upvotes: 3