Reputation: 953
I am certain that this problem can be solved with relative ease but I am struggling to find the problem. My code simply reads all the words from a file then stores each word, word position, start and end of sentence in an array. The array is output to another text file.
I can read in all the info up to the last sentence and then I have a bug. Any thoughts?
/**
* Programmer: fryeguy
* Course:
* Program: TxtCrawl for MicroSearch
*
* Algorithm:
* TxtCrawl is the component of MicroSearch that reads text
* documents for search terms and stores them for
* indexing
*
* 1. Count words in doc, then initialize
* wordsFromDoc array to wordCount
* 2. Initiate output file for writing.
* 3. Open input file for reading words.
* 4. Until reaching EOF:
* 4.a. Set value for start "get pointer" in startSentence (.tellg()).
* 4.b. Store value for end "get pointer" in endSentence (.tellg()).
* 4.c. Reset "get pointer" to startSentence location.
* 4.d. Until reaching endSentence, Read into the
* array theWord, wordPos, startSent, and endSent
* 5. Write wordsFromDoc array to file
* 6. When EOF is reached close the files.
*/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
using namespace std;
struct wordProps // stores word info to be placed in array
{
string theWord; // stores the word
int wordPos; // stores the position of word
int startSent; // stores the start point of the sentence
int endSent; // stores the end point of the sentence
};
void countWords(string, int&, int&);
int main()
{
ifstream iFile; // file stream for reading in data
ofstream oFile; // file stream for writing data
string iFileName = "TextFile2.txt"; // name of test file to read from
string oFileName = "OutputFile.txt"; // name of test file to write to
string aLine = ""; // stores a line preceeding a newline character (\n)
string aWord = ""; // stores words from doc for indexing
int charCount = 0; // count of characters in doc
int wordCount = 0; // count of words in doc
int aLineWordCount = 0; // count of words in a single line being processed
int wordBegin = 0; // stores location of word in doc
int startSentence = 0; // stores pointer value for start of sentence
int endSentence = 0; // stores pointer value for end of sentence
/**
* 1. Count words in doc, then initialize
* wordsFromDoc array to wordCount
*/
countWords(iFileName, charCount, wordCount);
cout << "charCount: " << charCount << endl; // DEBUG CODE
cout << "wordCount: " << wordCount << endl; // DEBUG CODE
wordProps wordsFromDoc[wordCount];
cout<< "length of array: " << (sizeof(wordsFromDoc) / sizeof(*wordsFromDoc)) << endl; // DEBUG CODE
/**
* 2. Initiate output file for writing
*/
oFile.open (oFileName.c_str()); // setup output file and write header
oFile << setw(20) << left << "File Name: " << iFileName << endl;
oFile << setw(20) << "---------------------------------------" << endl << endl;
/**
* 3. Open input file for reading words
*/
iFile.open (iFileName.c_str());
if (!iFile.is_open())
cout << "No such file exists!" << endl;
else
{
/**
* 4. Until reaching EOF:
*/
// I have been attempting different counting methods assuming the eof was being reached prematurely
// The results really have not varied with this code
// while (iFile.tellg() != charCount)
while (!iFile.eof())
{
//cout << "count: " << count << endl;
/**
* 4.a. Set value for start "get pointer" in startSentence (.tellg()).
*/
startSentence = iFile.tellg();
cout << "startSentence: " << startSentence << endl; // DEBUG CODE
/**
* 4.b. Store value for end "get pointer" in endSentence (.tellg()).
*/
getline(iFile, aLine, '.');
cout << aLine << endl; // DEBUG CODE
endSentence = iFile.tellg();
aLine.clear();
cout << "endSentence: " << endSentence << endl; // DEBUG CODE
if (!iFile.is_open())
{
cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
iFile.close();
iFile.open (iFileName.c_str());
}
/**
* 4.c. Reset "get pointer" to startSentence location.
*/
iFile.seekg(startSentence);
cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
/**
* 4.d. Until reaching endSentence, Read into the
* array theWord, wordPos, startSent, and endSent
*/
// As the last line is about to be read there is an error of some sort.
// My guess is that somehow I exceed the end of the file but my startSentence
// and endSentence variables are pointing where I think they should.
for ( ; iFile.tellg() < endSentence; aLineWordCount++)
{
wordsFromDoc[aLineWordCount].wordPos = iFile.tellg();
cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE
iFile >> wordsFromDoc[aLineWordCount].theWord;
cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE
wordsFromDoc[aLineWordCount].startSent = startSentence;
cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE
wordsFromDoc[aLineWordCount].endSent = endSentence;
cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE
cout << "aLineWordCount: " << aLineWordCount << endl;
} // end for
} // end while !=iFile.eof
// THIS section of code is never reached because of the hang up above.
/**
* 5. Write wordsFromDoc array to file
*/
for (int count = 0; count < aLineWordCount; count++)
{
oFile << setw(20) << left
<< wordsFromDoc[count].theWord << " "
<< wordsFromDoc[count].wordPos << " "
<< wordsFromDoc[count].startSent << " "
<< wordsFromDoc[count].endSent << endl;
}
} // end else
/**
* 6. When EOF is reached close the files.
*/
iFile.close();
oFile.close();
// DEBUG CDODE for verifying results
// for (int count = 0; count < wordCount; count++) {
// cout << "theWord: " << wordsFromDoc[count].theWord << endl;
// cout << "wordPos: " << wordsFromDoc[count].wordPos << endl;
// cout << "startSent: " << wordsFromDoc[count].startSent << endl;
// cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl;
// }
}
/**
* Implement countWords function
*/
void countWords(string theFileName, int &charCount, int &wordCount)
{
string theWord = "";
char theChar = ' ';
fstream inFile;
//count the chars
inFile.open (theFileName.c_str());
if (!inFile.is_open())
cout << "No such file exists!" << endl;
else
{
inFile.get(theChar);
while (!inFile.eof())
{
charCount++;
inFile.get(theChar);
}
}
inFile.close();
// count the words
inFile.open (theFileName.c_str());
if (!inFile.is_open())
cout << "No such file exists!" << endl;
else
{
while (!inFile.eof())
{
inFile >> theWord;
wordCount++;
}
}
inFile.close();
}
Upvotes: 1
Views: 1945
Reputation: 1772
Istream
I checked. Istream has no incarnation of get or getline handling multiple delimiters at once1.
Others have had the same issue2. Char-by-char IO was the most practical solution. Other solutions involved coding up enhanced versions of current Istream methods.
An Idea
Reading the whole document at once is not issue because, it is all in memory eventually anyway; the strings that hold the words all together equal the entire document. Once a re-delimited document has been written out to disk, memory can be freed.
NOTES
1 Istream::get
2 Multiple delimiters with getline (discussion at Code Guru)
Upvotes: 1