ayush nigam
ayush nigam

Reputation: 177

Hdfs file line count

Is there a way to count the lines of all files in an HDFS directory in Java, as we can with the following command on the command prompt?

hadoop fs -cat  /abc/def/* | wc -l

Specifically, I want to use the Hadoop API instead of writing MapReduce or Spark code.

Upvotes: 0

Views: 2544

Answers (1)

Max08
Max08

Reputation: 1025

Something like this should work:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LineCounter {

    /**
     * Counts the total number of lines across all regular files directly under an
     * HDFS directory — the Java equivalent of {@code hadoop fs -cat /dir/* | wc -l}.
     *
     * <p>Only the top level of the directory is scanned; subdirectory entries are
     * skipped, matching the behaviour of the shell glob above.
     *
     * @param args optional: {@code args[0]} is the HDFS directory to scan
     *             (defaults to {@code /some/path} when absent)
     * @throws IOException if the configuration files or any HDFS file cannot be read
     */
    public static void main(String[] args) throws IOException {

        Configuration conf = new Configuration();
        conf.addResource(new FileInputStream("hdfs-site.xml"));
        conf.addResource(new FileInputStream("core-site.xml"));

        // Pin the FileSystem implementations explicitly so the classpath's
        // ServiceLoader state cannot pick a different provider.
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        // Generalized: accept the directory on the command line, keeping the old
        // hard-coded path as the default for backward compatibility.
        Path pt = new Path(args.length > 0 ? args[0] : "/some/path");

        long count = 0;

        // try-with-resources ensures the FileSystem handle is released.
        try (FileSystem fs = FileSystem.get(conf)) {
            for (FileStatus f : fs.listStatus(pt)) {
                if (f.isFile()) {
                    count += countLines(fs, f.getPath());
                }
            }
        }

        // BUG FIX: the original computed the count but never reported it,
        // so the program produced no output at all.
        System.out.println(count);
    }

    /**
     * Counts the lines of a single HDFS file.
     *
     * @param fs   the filesystem the file lives on
     * @param file path of the file to read
     * @return the number of lines in the file
     * @throws IOException if the file cannot be opened or read
     */
    private static long countLines(FileSystem fs, Path file) throws IOException {
        long lines = 0;
        // try-with-resources closes the stream even if readLine() throws — the
        // original only closed the reader on the happy path and leaked it on error.
        // UTF-8 is specified explicitly to avoid platform-default-charset surprises.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(fs.open(file), StandardCharsets.UTF_8))) {
            while (reader.readLine() != null) {
                lines++;
            }
        }
        return lines;
    }
}

Upvotes: 4

Related Questions