user3605176
user3605176

Reputation: 39

Give input to hadoop job through code

I am a beginner to Hadoop. I want to understand the flow of the MapReduce functions. I am a bit confused about how to give input to the map job through code, rather than through a file. How should I configure this? Kindly assist me. Here is my code.

    /// <summary>
    /// Mapper for the NCDC weather data set: parses one fixed-width record and
    /// emits (year, air temperature) for readings that are present and pass the
    /// quality-code filter.
    /// </summary>
    public class TemperatureMapper : MapperBase
        {
            // Sentinel value used in the NCDC data for a missing temperature reading.
            private const int MISSING = 9999;

            // Quality codes considered trustworthy. Cached as static readonly so the
            // regex is compiled once rather than per record.
            private static readonly Regex GoodQuality = new Regex("[01459]");

            /// <summary>
            /// Processes a single input record.
            /// </summary>
            /// <param name="line">One fixed-width NCDC record: year at columns 15-18,
            /// signed air temperature at columns 87-91, quality code at column 92.</param>
            /// <param name="context">Hadoop context used to emit the key/value pair.</param>
            public override void Map(string line, MapperContext context)
            {
                string year = line.Substring(15, 4);

                // Skip a leading '+' sign, since int.Parse rejects it ('-' parses fine).
                int startIndex = line[87] == '+' ? 88 : 87;
                int airTemp = int.Parse(line.Substring(startIndex, 92 - startIndex));

                string quality = line.Substring(92, 1);

                // BUG FIX: the original built the regex from the quality character and
                // matched it against the literal text "[01459]" — backwards. The quality
                // code must be tested against the [01459] character class.
                if (airTemp != MISSING && GoodQuality.IsMatch(quality))
                {
                    context.EmitKeyValue(year, airTemp.ToString());
                }
            }
        }



        //Reducer

        /// <summary>
        /// Reducer (also usable as a combiner) that collapses all temperature
        /// readings for a year down to the single maximum value.
        /// </summary>
        public class TempReducer : ReducerCombinerBase
        {
            /// <summary>
            /// Emits (key, max value) for the given key. With an empty value
            /// sequence the emitted maximum is int.MinValue, matching a plain
            /// fold over an empty input.
            /// </summary>
            public override void Reduce(string key, IEnumerable<string> values,ReducerCombinerContext context)
            {
                // Running-maximum fold over the parsed readings.
                int best = int.MinValue;
                foreach (string reading in values)
                {
                    int parsed = int.Parse(reading);
                    if (parsed > best)
                    {
                        best = parsed;
                    }
                }

                context.EmitKeyValue(key, best.ToString());
            }
        }



static void Main(string[] args)
        {
            // Reads "temp.txt" line by line into a string array, runs the
            // MapReduce pipeline locally through StreamingUnit, then attempts
            // to submit the same job to a Hadoop cluster (Azure HDInsight).
            // NOTE(review): the try block below has no matching catch/finally
            // visible in this snippet — the method appears truncated here.

            try
            {

                string line;
                StreamReader file = new StreamReader("temp.txt");

                // Collect every line of the input file in memory.
                ArrayList al = new ArrayList();
                while ((line = file.ReadLine()) != null)
                {
                    al.Add(line);
                }
                file.Close();

                // Materialize as string[] — the in-memory input fed to the
                // local StreamingUnit run below.
                string[] input = (string[])al.ToArray(typeof(string));




                Environment.SetEnvironmentVariable("HADOOP_HOME", @"c:\hadoop");
                Environment.SetEnvironmentVariable("Java_HOME", @"c:\hadoop\jvm");

        var output = StreamingUnit.Execute<TemperatureMapper, TempReducer>(input); // local in-memory run; executes successfully


// Running the job on Azure.
var hadoop = Hadoop.Connect(); // connected to Hadoop successfully
                var config = new HadoopJobConfiguration();
                // Open question from the author: how to provide in-memory input
                // here? MapReduceJob expects input paths (files/blobs) set on
                // `config`, not raw strings.
                hadoop.MapReduceJob.Execute<TemperatureMapper, TempReducer>(config);
                Console.ReadLine();             

            }

I get the correct result through StreamingUnit. Now I want to execute this job on Azure. So how do I give the input through code and not by files? I have given the input through the config, i.e.

config.AdditionalStreamingArguments.AddRange(input); //input is array of string

but when I execute the job this exception occurs:

The argument must not be empty string.
Parameter name: blobName

Upvotes: 2

Views: 373

Answers (1)

Simon Elliston Ball
Simon Elliston Ball

Reputation: 4455

Hadoop MapReduce will only operate on files by default (you could write a storage handler that is not file based, but that's not very common).

If you are trying to apply MapReduce to something that is generated on a stream, rather than something that exists in files on HDFS, you might like to look into something like Storm on YARN instead.

Upvotes: 3

Related Questions