Reputation:
I have written Java code to count the total number of occurrences of each word. It takes two .txt
files as input and outputs the words and their frequencies.
I would also like to print how many times each file contains a given word. Do you have any idea how to do this?
/**
 * Prints how often each word occurs in the combined text of two input files.
 *
 * <p>The files {@code test.txt} and {@code test2.txt} are read from the working
 * directory, their lines are joined with single spaces, and the result is split
 * into words on punctuation and whitespace.
 */
public class JavaApplication2
{
    /**
     * Reads both input files, echoes the combined text, and prints one
     * {@code [word] frequency : n} line per distinct word.
     *
     * @param args unused
     * @throws IOException if either input file cannot be opened
     */
    public static void main(String[] args) throws IOException
    {
        Path filePath1 = Paths.get("test.txt");
        Path filePath2 = Paths.get("test2.txt");
        StringBuilder combined = new StringBuilder();
        // try-with-resources closes both scanners even when reading fails.
        try (Scanner readerL = new Scanner(filePath1);
             Scanner readerR = new Scanner(filePath2))
        {
            appendLines(readerL, combined);
            appendLines(readerR, combined);
        }
        String text = combined.toString();
        System.out.println(text);

        String[] keys = text.split("[!.?:;\\s]");
        String[] uniqueKeys = getUniqueKeys(keys);
        for (String key : uniqueKeys)
        {
            if (null == key)
            {
                // getUniqueKeys pads the tail of its result with nulls.
                break;
            }
            if (key.isEmpty())
            {
                // Adjacent delimiters (e.g. ". ") produce empty tokens; they are not words.
                continue;
            }
            int count = 0;
            for (String s : keys)
            {
                if (key.equals(s))
                {
                    count++;
                }
            }
            System.out.println("["+key+"] frequency : "+count);
        }
    }

    /**
     * Appends every remaining line of {@code reader} to {@code target}, separating
     * lines with a single space so that words on neighbouring lines (or in
     * neighbouring files) are never fused together.
     */
    private static void appendLines(Scanner reader, StringBuilder target)
    {
        while (reader.hasNextLine())
        {
            if (target.length() > 0)
            {
                target.append(' ');
            }
            target.append(reader.nextLine());
        }
    }

    /**
     * Returns the distinct entries of {@code keys} in order of first appearance.
     *
     * @param keys the tokens to de-duplicate; may be empty
     * @return an array of the same length as {@code keys}; the distinct values
     *         occupy the leading slots and every remaining slot is {@code null}
     */
    private static String[] getUniqueKeys(String[] keys)
    {
        String[] uniqueKeys = new String[keys.length];
        int uniqueKeyCount = 0;
        for (String candidate : keys)
        {
            boolean keyAlreadyExists = false;
            // Only the slots filled so far need to be inspected.
            for (int j = 0; j < uniqueKeyCount; j++)
            {
                if (candidate.equals(uniqueKeys[j]))
                {
                    keyAlreadyExists = true;
                    break;
                }
            }
            if (!keyAlreadyExists)
            {
                uniqueKeys[uniqueKeyCount] = candidate;
                uniqueKeyCount++;
            }
        }
        return uniqueKeys;
    }
}
Upvotes: 2
Views: 1780
Reputation: 157
Late answer; however, the code below counts word frequencies efficiently when there are multiple files.
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Task that tallies the whitespace-separated words of one {@link Scanner} into a
 * counter map shared with other {@code WordCounter} tasks.
 *
 * <p>Words are lower-cased before counting, so {@code "Apple"} and {@code "apple"}
 * share one entry.
 */
public class WordCounter implements Runnable {
    private final Scanner scanner;
    private final Map<String, AtomicLong> sharedCounter;

    /**
     * @param scanner       source of words; may be {@code null}, in which case
     *                      {@link #run()} does nothing. The scanner is closed once
     *                      it has been fully consumed.
     * @param sharedCounter map receiving the counts; {@code main} passes a
     *                      {@link ConcurrentHashMap} because several tasks update it
     */
    public WordCounter(Scanner scanner, Map<String, AtomicLong> sharedCounter) {
        this.scanner = scanner;
        this.sharedCounter = sharedCounter;
    }

    /** Consumes every token of the scanner and increments its counter. */
    @Override
    public void run() {
        if (scanner == null) {
            return;
        }
        // Closing here releases the underlying file once this task is finished with it.
        try (Scanner in = scanner) {
            while (in.hasNext()) {
                // Locale.ROOT keeps the case mapping independent of the machine's default locale.
                String word = in.next().toLowerCase(Locale.ROOT);
                sharedCounter.computeIfAbsent(word, k -> new AtomicLong()).incrementAndGet();
            }
        }
    }

    /**
     * Counts the words of {@code test1.txt} and {@code test2.txt} in parallel and
     * prints the resulting map.
     *
     * @param args unused
     * @throws IOException if one of the input files cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // Number of parallel threads to run
        int threadCount = 10;
        List<Path> paths = new ArrayList<>();
        // Add paths
        paths.add(Paths.get("test1.txt"));
        paths.add(Paths.get("test2.txt"));
        // Shared word counter
        Map<String, AtomicLong> sharedCounter = new ConcurrentHashMap<>();
        ExecutorService executor = Executors.newFixedThreadPool(threadCount);
        try {
            for (Path path : paths) {
                executor.execute(new WordCounter(new Scanner(path), sharedCounter));
            }
        } finally {
            // Always shut down, otherwise the pool's worker threads keep the JVM
            // alive when opening a file fails.
            executor.shutdown();
        }
        try {
            // Block until all tasks are finished instead of spinning on isTerminated().
            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            executor.shutdownNow();
            Thread.currentThread().interrupt();
            return;
        }
        System.out.println(sharedCounter);
    }
}
Upvotes: 0
Reputation: 1346
import java.io.*; import java.util.*;
public class file1{
public static void main(String[] args) throws Exception{
HashMap<String,Integer> words_fre = new HashMap<String,Integer>();
HashSet<String> words = new HashSet<String>();
try{
File folder = new File("/home/jsrathore/Dropbox/Semester 6th/IR_Lab/lab_01/one");
File[] listOfFiles = folder.listFiles();
BufferedReader bufferedReader=null;
FileInputStream inputfilename=null;
BufferedWriter out= new BufferedWriter(new OutputStreamWriter(new FileOutputStream("outfilename.txt",false), "UTF-8"));
for(File file : listOfFiles){
inputfilename= new FileInputStream(file);
/*System.out.println(file); */
bufferedReader= new BufferedReader(new InputStreamReader(inputfilename, "UTF-8"));
String s;
while((s = bufferedReader.readLine()) != null){
/*System.out.println(line);*/
s = s.replaceAll("\\<.*?>"," ");
if(s.contains("॥") || s.contains(":")|| s.contains("।")||
s.contains(",")|| s.contains("!")|| s.contains("?")){
s=s.replace("॥"," ");
s=s.replace(":"," ");
s=s.replace("।"," ");
s=s.replace(","," ");
s=s.replace("!"," ");
s=s.replace("?"," ");
}
StringTokenizer st = new StringTokenizer(s," ");
while (st.hasMoreTokens()) {
/*out.write(st.nextToken()+"\n");*/
String str=(st.nextToken()).toString();
words.add(str);
}
for(String str : words){
if(words_fre.containsKey(str)){
int a = words_fre.get(str);
words_fre.put(str,a+1);
}else{
words_fre.put(str,1);/*uwords++;//unique words count */
}
}
words.clear();
/*out.write("\n");
out.close();*/
}
Object[] key = words_fre.keySet().toArray();
Arrays.sort(key);
for (int i = 0; i < key.length; i++) {
//System.out.println(key[i]+"= "+words_fre.get(key[i]));
out.write(key[i]+" : "+words_fre.get(key[i]) +"\n");
}
}
out.close();
bufferedReader.close();
}catch(FileNotFoundException ex){
System.out.println("Error in reading line");
}catch(IOException ex){
/*System.out.println("Error in reading line"+fileReader );*/
ex.printStackTrace();
}
} }
Upvotes: 0
Reputation: 839
UPDATE: code for counting word occurrences (thanks @George).
This example reads a single file; you can adapt it for multiple files:
/**
 * Counts how often each word of a fixed word list occurs in a text file.
 * Words that are not on the list are ignored.
 */
public class MyTest {
    /** File read by {@link #getWordCount()} when no path is supplied. */
    private static final String DEFAULT_FILE_PATH = "C:\\Users\\Jyo\\Desktop\\help.txt";

    /** Maps every tracked word to the number of occurrences seen so far. */
    Map<String,Integer> mapTable;

    /**
     * @param wordList the words to track; each one starts with a count of zero
     */
    public MyTest(List<String> wordList){
        //initialize map
        makeMap(wordList);
    }

    /**
     * Replaces the counter table with a fresh one containing every word of
     * {@code wordList} mapped to zero.
     */
    public void makeMap(List<String> wordList){
        mapTable = new HashMap<>();
        for(String word : wordList){
            //fill the map up
            mapTable.put(word, 0);
        }
    }

    /** Updates the occurrence count for every word in {@code _words}. */
    public void updateMap(String [] _words){
        for(String word : _words){
            updateWordCount(word);
        }
    }

    /**
     * Increments the counter of {@code _word} if it is a tracked word;
     * does nothing otherwise.
     */
    public void updateWordCount(String _word){
        Integer current = mapTable.get(_word);
        //only words that are already present in the map are counted
        if(current != null){
            mapTable.put(_word, current + 1);
        }
    }

    /** Prints every tracked word together with its occurrence count. */
    public void DisplayCounts(){
        for(Map.Entry<String,Integer> entry : mapTable.entrySet()){
            System.out.println("Word : "+entry.getKey()+"\t Occurrence(s) :"+entry.getValue());
        }
    }

    /** Counts the tracked words in the default file. */
    public void getWordCount(){
        getWordCount(DEFAULT_FILE_PATH);
    }

    /**
     * Echoes each line of the given file, splits it on single spaces, and
     * counts the tracked words. I/O failures are reported on the console
     * rather than thrown.
     *
     * @param filePath path of the text file to read
     */
    public void getWordCount(String filePath){
        // FileReader reads text files in the default encoding.
        // try-with-resources closes the reader even if reading fails part-way.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filePath))) {
            String line;
            while((line = bufferedReader.readLine()) != null) {
                System.out.println(line);
                updateMap(line.split(" "));
            }
        } catch (java.io.IOException e) {
            System.out.println("Error :"+e.getMessage());
        }
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        List<String> wordList = new ArrayList<>();
        wordList.add("data");
        wordList.add("select");
        MyTest mt = new MyTest(wordList);
        mt.getWordCount();
        mt.DisplayCounts();
    }
}
Upvotes: 0
Reputation: 915
Firstly, instead of using an array for unique keys, use a HashMap<String, Integer>. It's a lot more efficient.
Your best option is to run your processing over each line/file separately, and store these counts separately. Then merge the two counts to get the overall frequencies.
More Detail:
// Split the text into tokens on punctuation and whitespace.
String[] keys = text.split("[!.?:;\\s]");
// Maps each distinct token to the number of times it occurs in the text.
HashMap<String,Integer> uniqueKeys = new HashMap<>();
for(String key : keys){
    if(uniqueKeys.containsKey(key)){
        // if the key is already in the map, increment its count
        uniqueKeys.put(key, uniqueKeys.get(key) + 1);
    }else{
        // if it isn't in the map yet, add it with a count of one
        uniqueKeys.put(key, 1);
    }
}
// You now have the count of all unique keys in a given text
// To print them to console
for(Entry<String, Integer> keyCount : uniqueKeys.entrySet()){
    System.out.println(keyCount.getKey() + ": " + keyCount.getValue());
}
// The two loops below are alternatives: use exactly one of them, otherwise the
// counts of uniqueKeys1 are added to uniqueKeys2 twice.
// To merge, if you're using Java 8
for(Entry<String, Integer> keyEntry : uniqueKeys1.entrySet()){
    // merge() stores the value when the key is absent, otherwise sums old and new
    uniqueKeys2.merge(keyEntry.getKey(), keyEntry.getValue(), Integer::sum);
}
// To merge, otherwise
for(Entry<String, Integer> keyEntry : uniqueKeys1.entrySet()){
    if(uniqueKeys2.containsKey(keyEntry.getKey())){
        uniqueKeys2.put(keyEntry.getKey(),
                uniqueKeys2.get(keyEntry.getKey()) + keyEntry.getValue());
    }else{
        uniqueKeys2.put(keyEntry.getKey(), keyEntry.getValue());
    }
}
Upvotes: 1