Reputation: 45692
I have a text file in the following format: each line starts with a string, which is followed by a sequence of numbers. Each line has an unknown length (an unknown amount of numbers, anywhere from 0 to 1000).
string_1 3 90 12 0 3
string_2 49 0 12 94 13 8 38 1 95 3
.......
string_n 9 43
Afterwards I must handle each line with handleLine
method, which accepts two arguments: the string name and the set of numbers (see code below).
How to read the file and handle each line with handleLine
efficiently?
My workaround:
Files.lines
. Is it blocking? I think it's pretty inefficient due to the 2nd and 3rd steps. The 1st step means that Java converts the file bytes to a string first, and then in the 2nd and 3rd steps I convert them again to String
/Set<Integer>
. Does that influence performance a lot? If yes - how to do better?
public handleFile(String filePath) {
try (Stream<String> stream = Files.lines(Paths.get(filePath))) {
stream.forEach(this::indexLine);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Tokenizes one raw line, treats the first token as the name and the remaining tokens as
 * integers, and delegates to the two-argument {@code handleLine}.
 * A line that yields no tokens is skipped.
 *
 * @param line one line of the input file
 * @throws NumberFormatException if a token after the first one is not a valid {@code int}
 */
private void handleLine(String line) {
    List<String> resultList = this.parse(line);
    if (resultList.isEmpty()) {
        // Blank line: there is no name to take, and remove(0) would throw IndexOutOfBoundsException.
        return;
    }
    String string_i = resultList.remove(0);
    Set<Integer> numbers = resultList.stream().map(Integer::valueOf).collect(Collectors.toSet());
    handleLine(string_i, numbers); // Here is the final computation, which must be done only with the string_i & numbers arguments
}
/** Matches one whitespace-delimited token; compiled once instead of on every call. */
private static final Pattern TOKEN = Pattern.compile("\\S+");

/**
 * Splits {@code str} into its whitespace-delimited tokens, in order of appearance.
 * Tokens are kept whole, so a name such as {@code string_1} is returned as a single token
 * rather than being broken up at the underscore.
 *
 * @param str the line to tokenize
 * @return a mutable list of the tokens; empty when {@code str} holds no non-whitespace characters
 */
private List<String> parse(String str) {
    List<String> output = new LinkedList<String>();
    Matcher match = TOKEN.matcher(str);
    while (match.find()) {
        output.add(match.group());
    }
    return output;
}
Upvotes: 1
Views: 2195
Reputation: 526
I set out to test several ways to go about this problem and measure the performance as best I could under noted conditions. Here's what I tested and how I tested it, along with the accompanying results:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Scanner;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * Rough micro-benchmark comparing several ways of reading a text file whose lines have the
 * form {@code name n1 n2 ...} and turning the numbers of each line into a list.
 *
 * <p>Every {@code methodN} reads the whole file once; the call that would hand the parsed
 * line on is left as a {@code handleLine} placeholder comment. {@link #main} times repeated
 * runs of each method and prints the elapsed milliseconds.
 */
public class App {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_TEST_FILE = "C:\\Users\\Nick\\Desktop\\test.txt";

    /** Untimed runs executed before each measurement so the measured code itself is warmed up. */
    private static final int WARMUP_RUNS = 1_000;

    /** Timed runs per method. */
    private static final int TIMED_RUNS = 10_000;

    /**
     * Token-by-token parsing with a {@link Scanner} opened directly on the path.
     * A non-integer token starts a new record; integer tokens are appended to the current list.
     *
     * @param testFile path of the file to read
     */
    public static void method1(String testFile) {
        // Start with a real list: if the very first token is numeric there is no record yet,
        // and a null list would cause a NullPointerException.
        List<Integer> nums = new ArrayList<>();
        try (Scanner s = new Scanner(Paths.get(testFile))) {
            while (s.hasNext()) {
                if (s.hasNextInt()) {
                    nums.add(s.nextInt());
                } else {
                    nums = new ArrayList<>();
                    String name = s.next();
                    // handleLine( name ... nums ... );
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Same token-by-token parsing as {@link #method1}, but the {@link Scanner} wraps a
     * {@link BufferedReader} over a {@link FileReader}.
     *
     * @param testFile path of the file to read
     */
    public static void method2(String testFile) {
        // Non-null from the start for the same reason as in method1.
        List<Integer> nums = new ArrayList<>();
        try (BufferedReader in = new BufferedReader(new FileReader(testFile));
                Scanner s = new Scanner(in)) {
            while (s.hasNext()) {
                if (s.hasNextInt()) {
                    nums.add(s.nextInt());
                } else {
                    nums = new ArrayList<>();
                    String name = s.next();
                    // handleLine( name ... nums ... );
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * {@link BufferedReader#readLine()} + {@code split(" ")} + {@link Integer#valueOf(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method3(String testFile) {
        try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] arr = line.split(" ");
                List<Integer> nums = new ArrayList<>();
                // Index 0 is the name; the numbers start at index 1.
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.valueOf(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Variant of {@link #method3} that parses with {@link Integer#parseInt(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method3_1(String testFile) {
        try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] arr = line.split(" ");
                List<Integer> nums = new ArrayList<>();
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.parseInt(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * {@link Files#readAllLines} + {@code split(" ")} + {@link Integer#valueOf(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method4(String testFile) {
        try {
            List<String> lines = Files.readAllLines(Paths.get(testFile));
            for (String s : lines) {
                String[] arr = s.split(" ");
                List<Integer> nums = new ArrayList<>();
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.valueOf(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Variant of {@link #method4} that parses with {@link Integer#parseInt(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method4_1(String testFile) {
        try {
            List<String> lines = Files.readAllLines(Paths.get(testFile));
            for (String s : lines) {
                String[] arr = s.split(" ");
                List<Integer> nums = new ArrayList<>();
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.parseInt(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * {@link Files#newBufferedReader} with all lines collected into a list first, then
     * {@code split(" ")} + {@link Integer#valueOf(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method5(String testFile) {
        try (BufferedReader br = Files.newBufferedReader(Paths.get(testFile))) {
            List<String> lines = br.lines().collect(Collectors.toList());
            for (String s : lines) {
                String[] arr = s.split(" ");
                List<Integer> nums = new ArrayList<>();
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.valueOf(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Variant of {@link #method5} that parses with {@link Integer#parseInt(String)}.
     *
     * @param testFile path of the file to read
     */
    public static void method5_1(String testFile) {
        try (BufferedReader br = Files.newBufferedReader(Paths.get(testFile))) {
            List<String> lines = br.lines().collect(Collectors.toList());
            for (String s : lines) {
                String[] arr = s.split(" ");
                List<Integer> nums = new ArrayList<>();
                for (int i = 1; i < arr.length; ++i) {
                    nums.add(Integer.parseInt(arr[i]));
                }
                // handleLine( ... );
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * {@link Files#lines} with a word-boundary split and a regex filter for the numeric tokens.
     * Duplicate numbers within a line collapse because the tokens pass through a {@link Set}.
     *
     * @param testFile path of the file to read
     */
    public static void method6(String testFile) {
        try (Stream<String> stream = Files.lines(Paths.get(testFile))) {
            stream.forEach(line -> {
                // "\\b" is a word boundary, not a blank: the result holds the word tokens
                // interleaved with the whitespace runs between them.
                String[] split = line.split("\\b");
                Set<String> numbers = IntStream.range(1, split.length)
                        .mapToObj(index -> split[index])
                        .filter(str -> str.matches("\\d+")) // keep only the numeric tokens
                        .collect(Collectors.toSet());
                // One list per line, like the other methods, instead of one list shared by all lines.
                List<Integer> nums = new ArrayList<>();
                numbers.forEach(k -> nums.add(Integer.parseInt(k)));
                // handleLine( ... );
            });
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@code method} {@link #WARMUP_RUNS} times untimed, then {@link #TIMED_RUNS} times timed.
     *
     * @param method   the parsing method to measure
     * @param testFile path handed to every invocation
     * @return elapsed time of the timed runs, in milliseconds
     */
    private static long measure(Consumer<String> method, String testFile) {
        for (int i = 0; i < WARMUP_RUNS; ++i) {
            method.accept(testFile);
        }
        // nanoTime is meant for measuring elapsed time; currentTimeMillis follows the wall clock.
        long start = System.nanoTime();
        for (int i = 0; i < TIMED_RUNS; ++i) {
            method.accept(testFile);
        }
        return (System.nanoTime() - start) / 1_000_000L;
    }

    /**
     * Times every parsing method against the same file and prints one line per method.
     *
     * @param args optional: {@code args[0]} is the file to read; when absent,
     *             {@link #DEFAULT_TEST_FILE} is used
     */
    public static void main(String[] args) throws Exception {
        String testFile = args.length > 0 ? args[0] : DEFAULT_TEST_FILE;
        System.out.println("warming up a little...");

        long t1 = measure(App::method1, testFile);
        long t2 = measure(App::method2, testFile);
        long t3 = measure(App::method3, testFile);
        long t3_1 = measure(App::method3_1, testFile);
        long t4 = measure(App::method4, testFile);
        long t4_1 = measure(App::method4_1, testFile);
        long t5 = measure(App::method5, testFile);
        long t5_1 = measure(App::method5_1, testFile);
        long t6 = measure(App::method6, testFile);

        System.out.println("method 1 = " + t1 + " ms");
        System.out.println("method 2 = " + t2 + " ms");
        System.out.println("method 3 = " + t3 + " ms");
        System.out.println("method 3_1 = " + t3_1 + " ms");
        System.out.println("method 4 = " + t4 + " ms");
        System.out.println("method 4_1 = " + t4_1 + " ms");
        System.out.println("method 5 = " + t5 + " ms");
        System.out.println("method 5_1 = " + t5_1 + " ms");
        System.out.println("method 6 = " + t6 + " ms");
    }
}
Result output:
warming up a little...
method 1 = 1103 ms
method 2 = 872 ms
method 3 = 440 ms
method 3_1 = 418 ms
method 4 = 413 ms
method 4_1 = 376 ms
method 5 = 439 ms
method 5_1 = 384 ms
method 6 = 646 ms
To my understanding, the best approach out of the sample I tested was using Files.readAllLines
, s.split(" ")
, and Integer.parseInt
. That combination was apparently the fastest, though again only among the samples I created and tested. At the very least, you could switch to Integer.parseInt
to help somewhat.
Note I used sources to help gain some sought after approaches and applied them to this problem/example. E.g. this blog post, this tutorial, and this awesome dude @Peter-Lawrey. Also, further improvements can always be made!
Also, the test.txt file:
my_name 15 00 29 101 1234
cool_id 11 00 01 10 010101
longer_id_name 1234
dynamic_er 1 2 3 4 5 6 7 8 9 10 11 12 123 1456 15689 555555555
(note: performance may greatly vary depending on file size!)
Upvotes: 1
Reputation: 129
Here is your code to parse line into name and numbers
stream.forEach(line -> {
    // "\\b" splits at word boundaries (not at blanks): the result holds the word tokens
    // interleaved with the whitespace runs between them, with the name at index 0.
    String[] split = line.split("\\b");
    Set<Integer> numbers = IntStream.range(1, split.length)
            .mapToObj(index -> split[index])
            .filter(str -> str.matches("\\d+")) // keep only the numeric tokens
            .map(Integer::valueOf)              // handleLine expects Set<Integer>, not Set<String>
            .collect(Collectors.toSet());
    handleLine(split[0], numbers);
});
Or another way
// Split at word boundaries, then partition the pieces into numeric (TRUE) and non-numeric (FALSE).
// partitioningBy always provides both keys, so a line without numbers gives an empty list
// rather than a missing entry. The whitespace pieces land in the FALSE list after the name
// and are never read.
Map<Boolean, List<String>> collect = Pattern.compile("\\b")
        .splitAsStream(line)
        .collect(Collectors.partitioningBy(str -> str.matches("\\d+")));
// handleLine expects the numbers as Set<Integer>.
Set<Integer> numbers = collect.get(Boolean.TRUE).stream()
        .map(Integer::valueOf)
        .collect(Collectors.toSet());
handleLine(collect.get(Boolean.FALSE).get(0), numbers);
Upvotes: 1
Reputation: 679
Regarding your first question, it depends on how you reference the Stream
. Streams
are inherently lazy, and don't do work if you're not going to use it. For example, the call to Files.lines
doesn't actually read the file until you add a terminal operation on the Stream
.
From the java doc:
Read all lines from a file as a Stream. Unlike readAllLines, this method does not read all lines into a List, but instead populates lazily as the stream is consumed
The forEach(Consumer<T>)
call is a terminal operation, and, at that point, the lines of the file are read one by one and passed to your indexLine
method.
Regarding your other comments, you don't really have a question here. What are you trying to measure/minimize? Just because something takes multiple steps doesn't inherently mean it performs poorly. Even if you created a whiz-bang one-liner to convert from the File
bytes directly to your String
& Set
, you probably just did the intermediate mapping anonymously, or you've called something that will cause the compiler to do that anyway.
Upvotes: 3