fryeguy
fryeguy

Reputation: 953

Trouble reading from a file. Appears to reach EOF prematurely

I am certain that this problem can be solved with relative ease but I am struggling to find the problem. My code simply reads all the words from a file then stores each word, word position, start and end of sentence in an array. The array is output to another text file.

I can read in all the info up to the last sentence and then I have a bug. Any thoughts?

/**
 *  Programmer: fryeguy
 *  Course: 
 *  Program: TxtCrawl for MicroSearch
 *
 *  Algorithm:
 *  TxtCrawl is the component of MicroSearch that reads text
 *  documents for search terms and stores them for
 *  indexing
 *
 *  1. Count words in doc, then initialize
 *     wordsFromDoc array to wordCount
 *  2. Initiate output file for writing.
 *  3. Open input file for reading words.
 *  4. Until reaching EOF:
 *     4.a. Set value for start "get pointer" in startSentence (.tellg()).
 *     4.b. Store value for end "get pointer" in endSentence (.tellg()).
 *     4.c. Reset "get pointer" to startSentence location.
 *     4.d. Until reaching endSentence, Read into the
 *          array theWord, wordPos, startSent, and endSent
 *  5. Write wordsFromDoc array to file
 *  6. When EOF is reached close the files.
 */

#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

/**
 * Record describing one word taken from the document; these records are
 * collected in an array and later written to the output file.
 */
struct wordProps
{
    /** The word itself. */
    std::string theWord;

    /** Position of the word. */
    int wordPos;

    /** Start point of the sentence the word belongs to. */
    int startSent;

    /** End point of the sentence the word belongs to. */
    int endSent;
};

void countWords(string, int&, int&);

int main()
{

    ifstream iFile; // file stream for reading in data
    ofstream oFile; // file stream for writing data

    string  iFileName = "TextFile2.txt";    // name of test file to read from
    string  oFileName = "OutputFile.txt";   // name of test file to write to
    string  aLine = "";                     // stores a line preceeding a newline character (\n)
    string  aWord = "";                     // stores words from doc for indexing
    int     charCount = 0;                  // count of characters in doc
    int     wordCount = 0;                  // count of words in doc
    int     aLineWordCount = 0;             // count of words in a single line being processed
    int     wordBegin = 0;                  // stores location of word in doc
    int     startSentence = 0;              // stores pointer value for start of sentence
    int     endSentence = 0;                // stores pointer value for end of sentence

    /**
     * 1. Count words in doc, then initialize
     *    wordsFromDoc array to wordCount
     */
    countWords(iFileName, charCount, wordCount);
    cout << "charCount: " << charCount << endl; // DEBUG CODE
    cout << "wordCount: " << wordCount << endl; // DEBUG CODE
    wordProps wordsFromDoc[wordCount];
    cout<< "length of array: " << (sizeof(wordsFromDoc) / sizeof(*wordsFromDoc)) << endl;  // DEBUG CODE

    /**
     * 2. Initiate output file for writing
     */
    oFile.open (oFileName.c_str()); // setup output file and write header
    oFile << setw(20) << left << "File Name: " << iFileName << endl;
    oFile << setw(20) << "---------------------------------------" << endl << endl;

    /**
     * 3. Open input file for reading words
     */
    iFile.open (iFileName.c_str());
    if (!iFile.is_open())
        cout << "No such file exists!" << endl;
    else
    {
        /**
         * 4. Until reaching EOF:
         */
        // I have been attempting different counting methods assuming the eof was being reached prematurely
        // The results really have not varied with this code
        // while (iFile.tellg() != charCount) 
        while (!iFile.eof())
        {
            //cout << "count: " << count << endl;
            /**
             * 4.a. Set value for start "get pointer" in startSentence (.tellg()).
             */
            startSentence = iFile.tellg();
            cout << "startSentence: " << startSentence << endl; // DEBUG CODE

            /**
             * 4.b. Store value for end "get pointer" in endSentence (.tellg()).
             */
            getline(iFile, aLine, '.');
            cout << aLine << endl; // DEBUG CODE
            endSentence = iFile.tellg();
            aLine.clear();
            cout << "endSentence: " << endSentence << endl; // DEBUG CODE

            if (!iFile.is_open())
            {
                cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE
                iFile.close();
                iFile.open (iFileName.c_str());
            }

            /**
             * 4.c. Reset "get pointer" to startSentence location.
             */
            iFile.seekg(startSentence);
            cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE

            /**
             * 4.d. Until reaching endSentence, Read into the
             *      array theWord, wordPos, startSent, and endSent
             */

             // As the last line is about to be read there is an error of some sort.
             // My guess is that somehow I exceed the end of the file but my startSentence
             // and endSentence variables are pointing where I think they should.

            for ( ; iFile.tellg() < endSentence; aLineWordCount++)
            {
                wordsFromDoc[aLineWordCount].wordPos = iFile.tellg();
                cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE
                iFile >> wordsFromDoc[aLineWordCount].theWord;
                cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].startSent = startSentence;
                cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE
                wordsFromDoc[aLineWordCount].endSent = endSentence;
                cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE
                cout << "aLineWordCount: " << aLineWordCount << endl;
            } // end for

        } // end while !=iFile.eof

            // THIS section of code is never reached because of the hang up above.
            /**
             * 5. Write wordsFromDoc array to file
             */
            for (int count = 0; count < aLineWordCount; count++)
            {
                oFile << setw(20) << left
                << wordsFromDoc[count].theWord << " "
                << wordsFromDoc[count].wordPos << " "
                << wordsFromDoc[count].startSent << " "
                << wordsFromDoc[count].endSent << endl;
            }

    } // end else

    /**
     * 6. When EOF is reached close the files.
     */
    iFile.close();
    oFile.close();

// DEBUG CDODE for verifying results
//  for (int count = 0; count < wordCount; count++) {
//      cout << "theWord: " << wordsFromDoc[count].theWord << endl;
//      cout << "wordPos: " << wordsFromDoc[count].wordPos << endl;
//      cout << "startSent: " << wordsFromDoc[count].startSent << endl;
//      cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl;
//  }

}

/**
 * Count the characters and the whitespace-separated words in a text file.
 *
 * The results are ADDED to charCount and wordCount, so callers that want
 * plain totals must pass in zero-initialised variables.
 *
 * @param theFileName  path of the file to examine
 * @param charCount    incremented once per character read from the file
 * @param wordCount    incremented once per word successfully extracted
 *
 * If the file cannot be opened, "No such file exists!" is printed to
 * standard output and both counters are left untouched.
 */
void countWords(std::string theFileName, int &charCount, int &wordCount)
{
    // Read-only stream: a read/write fstream would fail on read-only files.
    std::ifstream inFile(theFileName.c_str());
    if (!inFile.is_open())
    {
        std::cout << "No such file exists!" << std::endl;
        return;
    }

    // count the chars; the read itself is the loop condition, so the final
    // failed read at end-of-file is never counted
    char theChar = ' ';
    while (inFile.get(theChar))
        ++charCount;

    // rewind for the second pass; the flags set by hitting end-of-file must
    // be cleared first or the seek has no effect
    inFile.clear();
    inFile.seekg(0, std::ios::beg);

    // count the words; testing the extraction (rather than eof() beforehand)
    // avoids counting a phantom word when the file ends in whitespace
    std::string theWord;
    while (inFile >> theWord)
        ++wordCount;
}

Upvotes: 1

Views: 1945

Answers (1)

frayser
frayser

Reputation: 1772

Istream

I checked. Istream has no incarnation of get or getline that handles multiple delimiters at once [1].

Others have had the same issue [2]. Char-by-char IO was the most practical solution. Other solutions involved coding up enhanced versions of current Istream methods.

An Idea

  1. Read the full file into memory at once.
  2. Remove the newlines (any CR or LF).
  3. Split the document into lines ending at each of the special full stop delimiters, by placing a consistent marker(LF or ETX '\003') after each of those delimiters while writing the document back out to disk.
  4. Now the document can be processed as usual; but using the known marker instead of the period as the delimiter.
  5. Delete the temporary file holding the re-delimited document.

Reading the whole document at once is not an issue, because it is all in memory eventually anyway; the strings that hold the words, taken all together, equal the entire document. Once a re-delimited document has been written out to disk, memory can be freed.

NOTES

1 Istream::get
2 Multiple delimiters with getline (discussion at Code Guru)

Upvotes: 1

Related Questions