Ubuntuer
Ubuntuer

Reputation: 31

http response from recv() is missing raw/binary data in the body C++ sockets

working on a http client that downloads data from a website using HTTP/1.1 (not https on port 443). The program is working for the most part, but when I attempt to write the data to a file, the data in the file does not match with the data when I physically save the file from a browser. Here is the code below:

#include <fstream>
#include <iostream>
#include <sys/socket.h>
#include <sys/types.h>
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <string>
#include <cstring>

using namespace std;

string resolveToIPAddress(string, string);
int getPortNum(string);
string getUrlPath(string);
string getDomain(string);
string getDomainWithFile(string);
string getUrlWithNoPort(string);
string getFileFromURL(string);
string getFilePath(string);
bool hasHTTPProtocol(string);
bool hasPortNum(string);
bool checkIfValidProtocol(string);


int main(int argc, char ** argv) {
    //Create a file to be opened for output
    ofstream out_file;
    out_file.open("output", ios::out | ios::binary);
    int connection_port = 0, send_req, recv_req;
    string ip_addr = "", domain = "", ipFromDomain = "", file_to_get = "", http_response = "";
    
    ip_addr = argv[1]; //point ip address argument to the 2nd indice in the arguments

    //Check if http is valid
    if(hasHTTPProtocol(ip_addr) == true) {
             //if there is a port number, Get port number (extract it from URL)
        if(hasPortNum(ip_addr)) {
            connection_port = getPortNum(ip_addr);
            domain = getDomain(ip_addr);
        }
            //if there is no port number, set default port number = 80 and set domain to truncated ip address
        else {
            connection_port = 80;
            domain = getDomain(ip_addr);
        }
        
        file_to_get = getFileFromURL(ip_addr);
        //For Testing purposes only
        //cout << "Domain: " << domain << endl;
        //cout << getDomainWithFile(ip_addr) << endl;
        //cout << "URL without port: " << getUrlWithNoPort(ip_addr) << endl;
        //cout << "URL Path: " << getUrlPath(ip_addr);
        //cout << "Path and File: " << getFilePath(ip_addr) << endl;
        //cout << "Domain and File: " << getDomainWithFile(ip_addr) << endl;
        // cout << "Port Number: " << connection_port << endl;
        // cout << "File requested: " << file_to_get << endl;
        //resolve domain to ipAddress
        ipFromDomain = resolveToIPAddress(domain, to_string(connection_port));
        //Connect to iP Address with port and return metadata
        
        //Create the socket
        int http_client_socket = socket(AF_INET, SOCK_STREAM, 0);

        // connect address and contain socket
        struct sockaddr_in connection_addr;
        connection_addr.sin_family = AF_INET; //set the addressfamily type to INET 
        connection_addr.sin_port = htons(connection_port); //set socket to parsed port number
        //cout << "ip address: " << ipFromDomain << endl; //checking to see if ip address is well converted
        inet_aton(ipFromDomain.c_str(), &connection_addr.sin_addr); //convert ip address from IPV4 notation string and store it into structure

        //Connect to server address
        if(connect(http_client_socket, (struct sockaddr *) &connection_addr, sizeof(connection_addr)) != 0) {
            out_file << "NOCONNECTION" << endl;
            out_file.close();
            exit(0);
        }

        //Logic for HTTP GET Request
        string http_request = "GET /" + getFilePath(ip_addr) + " HTTP/1.1\r\nHost:" + domain + "\r\nConnection: close\r\n\r\n";
        //cout << http_request << endl;
        send_req = send(http_client_socket, http_request.c_str(), http_request.length(), 0);
        if ( send_req != http_request.length() ) { std::cerr << "Oops on send\n"; }
        char buff[BUFSIZ];
        int n;
        while ( (n=recv(http_client_socket, buff, sizeof(buff), 0)) > 0 ) {
            http_response.append(buff, n);
        }
        
        //Testing
        //cout << http_response << endl;

        //Test for 404, if there is a 404 then close the program and exit with "FILENOTFOUND" in output file
        if(http_response.find("HTTP/1.1 404") != string::npos) {
            out_file << "FILENOTFOUND" << endl;
            out_file.close();
            exit(0);
        }

        const char * http_body_data = strstr(http_response.c_str(), "\r\n\r\n");
        if(http_body_data) {
            http_body_data += 4;
        }

        //out_file << http_response;
        //close the file
        out_file << http_body_data;
        out_file.close();

        //close the socket
        close(http_client_socket);
    } else {
        out_file << "INVALIDPROTOCOL" << endl;
        out_file.close();
        exit(0);
    }

    return 0;
}

string getUrlWithNoPort(string url) {
    if(hasHTTPProtocol(url))
        return url.substr(7, url.length() - 7);
    return url;
}
//Get URL without port and path
string getDomain(string url) {
    string urlWithoutPortandPath = "";
    int i = 0;
    //Check if full URL has a protocol
    if (hasHTTPProtocol(url)) {
        //if it has a protocol truncate the protocol from FQDN
        i = 7;
        while (url[i] != '/') {

        
        //for (int i = 7; i < url.length(); i++) {
            if (url[i] == ':') {
                break;
            }
            urlWithoutPortandPath += url[i];
            i++;
        //}
        }
        return urlWithoutPortandPath;
    }
    
    //if it does not have a protocol remove port number and path
    while (url[i] != '/') {
    //for (int i = 0; i < url.length(); i++) {
        if (url[i] == ':') {
            break;
        }
        urlWithoutPortandPath += url[i];
        i++;
    } 
    return urlWithoutPortandPath;
}

string getDomainWithFile(string url) {
    string urlWithoutPortandPath = "";
    //Check if full URL has a protocol
    if (hasHTTPProtocol(url)) {
        //if it has a protocol truncate the protocol from FQDN
        for (int i = 7; i < url.length(); i++) {
            if (url[i] == ':') {
                break;
            }
            urlWithoutPortandPath += url[i];
        }
        return urlWithoutPortandPath;
    }
    
    //if it does not have a protocol remove port number and path
   
    for (int i = 0; i < url.length(); i++) {
        if (url[i] == ':') {
            break;
        }
        urlWithoutPortandPath += url[i];
    } 
    return urlWithoutPortandPath;
}

bool hasHTTPProtocol(string url) {
    string httpProtocol = url.substr(0, 7);
    if(httpProtocol == "http://")
        return true;
    return false;
}

int getPortNum(string url) {
    string port = "";
    int portNum, portIdx = 0, pathIdx = 0, portEndIdx = 0;
    if(hasHTTPProtocol(url)) {
        for(int i = 7; i < url.length(); i++) {
            if (url[i] == ':')
                portIdx = i + 1;
        }
    }
    

    string fromPortToPath = url.substr(portIdx, url.length() - portIdx);
    //cout << "Port to Path: " << fromPortToPath << endl;

    for (int i = 0; i < fromPortToPath.length(); i++) {
        if (fromPortToPath[i] == '/') {
            pathIdx = i + 1;
            portEndIdx = i;
            break;
        }
    }
    port = fromPortToPath.substr(0, portEndIdx);
    portNum = stoi(port);
    return portNum;
}

string getUrlPath(string url) {
    string urlPath = "";
    int pathIdx = 0, portIdx = 0, portEndIdx = 0;
    if(hasHTTPProtocol(url)) {
         for(int i = 7; i < url.length(); i++) {
            if (url[i] == ':')
                portIdx = i + 1;
        }
    }
   
    string fromPortToPath = url.substr(portIdx, url.length() - portIdx);
    cout << "Port to Path: " << fromPortToPath << endl;

    for (int i = 0; i < fromPortToPath.length(); i++) {
        if (fromPortToPath[i] == '/') {
            pathIdx = i + 1;
            portEndIdx = i;
            break;
        }
    }
    urlPath = fromPortToPath.substr(portEndIdx + 1, fromPortToPath.length() - pathIdx );
    return urlPath; 
}

bool hasPortNum(string url) {
    if(hasHTTPProtocol(url)) {
        for (int i = 7; i < url.length(); i++) {
            if (url[i] == ':')
                return true;
        }
    } else {
        for (int i = 0; i < url.length(); i++) {
            if (url[i] == ':')
                return true;
        }
    }
    
    return false;
}

//Resolves a string hostname e.g. google.com into an ipaddress (practically a DNS function)
string resolveToIPAddress(string urlString, string portNum) {
    struct addrinfo hints, *results;
    struct addrinfo *result;
    int error, sock_id;
    string numericalIPS[100];
    //set all bits in hints to zero
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    if((error = getaddrinfo(urlString.c_str(), portNum.c_str(), &hints, &results)) != 0) {
        cout << "error " << error << ":" << gai_strerror(error) << endl;
    }

    int i = 0;
    //loop through results
    for(result = results; result != nullptr; result = result->ai_next) {
        struct sockaddr_in *ip_addr;
        ip_addr = (struct sockaddr_in *)result->ai_addr;
        numericalIPS[i] = inet_ntoa(ip_addr->sin_addr);
        i++;
    }
    return numericalIPS[0];
}

string getFileFromURL(string url) {
    int idxToFile;
    string file_request;
    string path_to_file = getDomainWithFile(url);
    for (int i = 0; i < path_to_file.length(); i++) {
        if(path_to_file[i] == '/') {
            idxToFile = i + 1;
        }
    }
    file_request = path_to_file.substr(idxToFile, path_to_file.length() - idxToFile);
    return file_request;
}

string getFilePath(string url) {
    string domainPathAndFile = getDomainWithFile(url);
    string pathAndFile;
    int idxToPath;
    for (int i = 0; i < domainPathAndFile.length(); i++) {
        if(domainPathAndFile[i] == '/') {
            idxToPath = i + 1;
            break;
        }
    }
    pathAndFile = domainPathAndFile.substr(idxToPath, domainPathAndFile.length() - idxToPath);
    return pathAndFile;


}

I used this website as a testing medium: http://www.testingmcafeesites.com/testcat_ac.html

When I do the following:

wget http://www.testingmcafeesites.com/testcat_ac.html

I get the following:

<html>
      <head>
            <title>URL for testing - Category Art/Culure</title>
      </head>

      <body>
            <code>
              http://www.testingmcafeesites.com/testcat_ac.html<br>
              <br>

                  This is an example URL which should be categorized as an art/culture website with a minimal risk reputation score.<br>
                  This page simply displays this text without any specific content on it, it is just for testing purpose.<br>
                  <br>
                  <b>If you can see this text, it was not blocked by any filter!</b><br>
                  <br>
            </code>
      </body>
</html>

However, with my program, I get the following and there are spaces as well in the first couple of lines:

<html>
      <head>
            <title>URL for testing - Category Art/Culure</title>
      </head>

      <body>
            <code>
              http://www.testingmcafeesites.com/testcat_ac.html<br>
              <br>

                  This is an example URL which should be categorized as an art/culture website with a minimal risk reputation score.<br>
                  This page simply displays this text without any specific content on it, it is just for testing purpose.<br>
                  <br>
                  <b>If you can see this text, it was not blocked by any filter!</b><br>
                  <br>
            </code>
      </body>

Sadly, this platform does not illustrate the spaces. What can I do to improve the body of the http response in my code. I was able to remove the header of the response with strstr but I feel as if I am missing something else. Thanks!

Upvotes: 0

Views: 229

Answers (1)

3CxEZiVlQ
3CxEZiVlQ

Reputation: 38991

I suggest that you use

wget http://www.testingmcafeesites.com/testcat_ac.html -O testcat_ac.html

Open the file testcat_ac.html in a hex editor and you see it contains a lot of unprintable characters, null bytes. Dealing with such data as a C string http_response.c_str() is not productive. Use std::string equivalent functions.

Upvotes: 1

Related Questions