Reputation: 10788
Minimum reproducible example below, with the dependency:
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
<version>8.5.12</version>
</dependency>
The issue with the code below is that it works with short numerical values, and it doesn't with bigger values 😕.
package net.clementlevallois.functions.mapsofscience;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ReferenceOpenHashSet;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
*
* @author LEVALLOIS
*/
public class JournalSimilaritiesComputer {
static Long2ObjectOpenHashMap<ReferenceOpenHashSet<Long>> journal2AuthorsMap = new Long2ObjectOpenHashMap<>();
public static void main(String[] args) throws IOException {
JournalSimilaritiesComputer computer = new JournalSimilaritiesComputer();
computer.loadDataToMap();
System.out.println("number of entries in the map: " + journal2AuthorsMap.size());
computer.doubleLoopingThroughJournalIds();
}
private void loadDataToMap() throws IOException {
/*
The goal is to:
- load lines
- split them on |
- the left side is an entity, the right side is thing to measure similarity on.
- in the examples below, we would expect 2 lines to be similar because they have the same value on the right side
- but weirdly, the second example fails to detect the similarity. Help!
*/
List<String> lines = List.of("1|0","2|0"); // DETECTION OF SIMILARITY WORKS
lines = List.of("4210168784|3089954349","198793727|3089954349"); // DETECTION OF SIMILARITY FAILS
lines.stream().forEach(line -> {
processLine(line);
});
}
private void doubleLoopingThroughJournalIds() {
journal2AuthorsMap.keySet().longStream().forEach(journalIdA -> {
ReferenceOpenHashSet<Long> authorsOfJournalA = journal2AuthorsMap.get(journalIdA);
journal2AuthorsMap.keySet().longStream().forEach(journalIdB -> {
if (journalIdA == journalIdB) {
return;
}
ReferenceOpenHashSet<Long> authorsOfJournalB = journal2AuthorsMap.get(journalIdB);
Integer similarity = computeSimilarities(authorsOfJournalA, authorsOfJournalB);
if (similarity > 0) {
String sim = journalIdA + "," + journalIdB + "," + similarity;
System.out.println("similarity detected! -> " + sim);
}
});
});
}
private Integer computeSimilarities(ReferenceOpenHashSet<Long> authorsOfJournalA, ReferenceOpenHashSet<Long> authorsOfJournalB) {
for (Long entry: authorsOfJournalA){
if (authorsOfJournalB.contains(entry)){
System.out.println("stop"); // never reached with example with long values
System.out.println("stop");
}
}
Set<Long> commonElements = authorsOfJournalA.parallelStream()
.filter(authorsOfJournalB::contains)
.collect(Collectors.toSet());
return commonElements.size(); // is zero with example with long values, should be equal to 1!
}
private void processLine(String line) {
String fields[] = line.split("\\|");
if (fields.length < 2) {
return;
}
String journalId = fields[0];
long journalIdAsLong = Long.parseLong(journalId);
String authorIdsAsLine = fields[1];
String authorIds[] = authorIdsAsLine.split(",");
ReferenceOpenHashSet<Long> setOfCurrentAuthors = new ReferenceOpenHashSet();
ReferenceOpenHashSet<Long> setOfAuthorsForThisJournal = journal2AuthorsMap.getOrDefault(journalIdAsLong, setOfCurrentAuthors);
for (String authorId : authorIds) {
try {
long authorIdLong = Long.parseLong(authorId);
setOfAuthorsForThisJournal.add(authorIdLong);
} catch (NumberFormatException e) {
System.out.println("error with author id, not long: " + authorId);
}
}
journal2AuthorsMap.put(journalIdAsLong, setOfAuthorsForThisJournal);
}
}
Upvotes: 0
Views: 275