Reputation: 31
working on a http client that downloads data from a website using HTTP/1.1 (not https on port 443). The program is working for the most part, but when I attempt to write the data to a file, the data in the file does not match with the data when I physically save the file from a browser. Here is the code below:
#include <fstream>
#include <iostream>
#include <sys/socket.h>
#include <sys/types.h>
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <string>
#include <cstring>
using namespace std;
string resolveToIPAddress(string, string);
int getPortNum(string);
string getUrlPath(string);
string getDomain(string);
string getDomainWithFile(string);
string getUrlWithNoPort(string);
string getFileFromURL(string);
string getFilePath(string);
bool hasHTTPProtocol(string);
bool hasPortNum(string);
bool checkIfValidProtocol(string);
int main(int argc, char ** argv) {
//Create a file to be opened for output
ofstream out_file;
out_file.open("output", ios::out | ios::binary);
int connection_port = 0, send_req, recv_req;
string ip_addr = "", domain = "", ipFromDomain = "", file_to_get = "", http_response = "";
ip_addr = argv[1]; //point ip address argument to the 2nd indice in the arguments
//Check if http is valid
if(hasHTTPProtocol(ip_addr) == true) {
//if there is a port number, Get port number (extract it from URL)
if(hasPortNum(ip_addr)) {
connection_port = getPortNum(ip_addr);
domain = getDomain(ip_addr);
}
//if there is no port number, set default port number = 80 and set domain to truncated ip address
else {
connection_port = 80;
domain = getDomain(ip_addr);
}
file_to_get = getFileFromURL(ip_addr);
//For Testing purposes only
//cout << "Domain: " << domain << endl;
//cout << getDomainWithFile(ip_addr) << endl;
//cout << "URL without port: " << getUrlWithNoPort(ip_addr) << endl;
//cout << "URL Path: " << getUrlPath(ip_addr);
//cout << "Path and File: " << getFilePath(ip_addr) << endl;
//cout << "Domain and File: " << getDomainWithFile(ip_addr) << endl;
// cout << "Port Number: " << connection_port << endl;
// cout << "File requested: " << file_to_get << endl;
//resolve domain to ipAddress
ipFromDomain = resolveToIPAddress(domain, to_string(connection_port));
//Connect to iP Address with port and return metadata
//Create the socket
int http_client_socket = socket(AF_INET, SOCK_STREAM, 0);
// connect address and contain socket
struct sockaddr_in connection_addr;
connection_addr.sin_family = AF_INET; //set the addressfamily type to INET
connection_addr.sin_port = htons(connection_port); //set socket to parsed port number
//cout << "ip address: " << ipFromDomain << endl; //checking to see if ip address is well converted
inet_aton(ipFromDomain.c_str(), &connection_addr.sin_addr); //convert ip address from IPV4 notation string and store it into structure
//Connect to server address
if(connect(http_client_socket, (struct sockaddr *) &connection_addr, sizeof(connection_addr)) != 0) {
out_file << "NOCONNECTION" << endl;
out_file.close();
exit(0);
}
//Logic for HTTP GET Request
string http_request = "GET /" + getFilePath(ip_addr) + " HTTP/1.1\r\nHost:" + domain + "\r\nConnection: close\r\n\r\n";
//cout << http_request << endl;
send_req = send(http_client_socket, http_request.c_str(), http_request.length(), 0);
if ( send_req != http_request.length() ) { std::cerr << "Oops on send\n"; }
char buff[BUFSIZ];
int n;
while ( (n=recv(http_client_socket, buff, sizeof(buff), 0)) > 0 ) {
http_response.append(buff, n);
}
//Testing
//cout << http_response << endl;
//Test for 404, if there is a 404 then close the program and exit with "FILENOTFOUND" in output file
if(http_response.find("HTTP/1.1 404") != string::npos) {
out_file << "FILENOTFOUND" << endl;
out_file.close();
exit(0);
}
const char * http_body_data = strstr(http_response.c_str(), "\r\n\r\n");
if(http_body_data) {
http_body_data += 4;
}
//out_file << http_response;
//close the file
out_file << http_body_data;
out_file.close();
//close the socket
close(http_client_socket);
} else {
out_file << "INVALIDPROTOCOL" << endl;
out_file.close();
exit(0);
}
return 0;
}
string getUrlWithNoPort(string url) {
if(hasHTTPProtocol(url))
return url.substr(7, url.length() - 7);
return url;
}
//Get URL without port and path
string getDomain(string url) {
string urlWithoutPortandPath = "";
int i = 0;
//Check if full URL has a protocol
if (hasHTTPProtocol(url)) {
//if it has a protocol truncate the protocol from FQDN
i = 7;
while (url[i] != '/') {
//for (int i = 7; i < url.length(); i++) {
if (url[i] == ':') {
break;
}
urlWithoutPortandPath += url[i];
i++;
//}
}
return urlWithoutPortandPath;
}
//if it does not have a protocol remove port number and path
while (url[i] != '/') {
//for (int i = 0; i < url.length(); i++) {
if (url[i] == ':') {
break;
}
urlWithoutPortandPath += url[i];
i++;
}
return urlWithoutPortandPath;
}
string getDomainWithFile(string url) {
string urlWithoutPortandPath = "";
//Check if full URL has a protocol
if (hasHTTPProtocol(url)) {
//if it has a protocol truncate the protocol from FQDN
for (int i = 7; i < url.length(); i++) {
if (url[i] == ':') {
break;
}
urlWithoutPortandPath += url[i];
}
return urlWithoutPortandPath;
}
//if it does not have a protocol remove port number and path
for (int i = 0; i < url.length(); i++) {
if (url[i] == ':') {
break;
}
urlWithoutPortandPath += url[i];
}
return urlWithoutPortandPath;
}
bool hasHTTPProtocol(string url) {
string httpProtocol = url.substr(0, 7);
if(httpProtocol == "http://")
return true;
return false;
}
int getPortNum(string url) {
string port = "";
int portNum, portIdx = 0, pathIdx = 0, portEndIdx = 0;
if(hasHTTPProtocol(url)) {
for(int i = 7; i < url.length(); i++) {
if (url[i] == ':')
portIdx = i + 1;
}
}
string fromPortToPath = url.substr(portIdx, url.length() - portIdx);
//cout << "Port to Path: " << fromPortToPath << endl;
for (int i = 0; i < fromPortToPath.length(); i++) {
if (fromPortToPath[i] == '/') {
pathIdx = i + 1;
portEndIdx = i;
break;
}
}
port = fromPortToPath.substr(0, portEndIdx);
portNum = stoi(port);
return portNum;
}
string getUrlPath(string url) {
string urlPath = "";
int pathIdx = 0, portIdx = 0, portEndIdx = 0;
if(hasHTTPProtocol(url)) {
for(int i = 7; i < url.length(); i++) {
if (url[i] == ':')
portIdx = i + 1;
}
}
string fromPortToPath = url.substr(portIdx, url.length() - portIdx);
cout << "Port to Path: " << fromPortToPath << endl;
for (int i = 0; i < fromPortToPath.length(); i++) {
if (fromPortToPath[i] == '/') {
pathIdx = i + 1;
portEndIdx = i;
break;
}
}
urlPath = fromPortToPath.substr(portEndIdx + 1, fromPortToPath.length() - pathIdx );
return urlPath;
}
bool hasPortNum(string url) {
if(hasHTTPProtocol(url)) {
for (int i = 7; i < url.length(); i++) {
if (url[i] == ':')
return true;
}
} else {
for (int i = 0; i < url.length(); i++) {
if (url[i] == ':')
return true;
}
}
return false;
}
//Resolves a string hostname e.g. google.com into an ipaddress (practically a DNS function)
string resolveToIPAddress(string urlString, string portNum) {
struct addrinfo hints, *results;
struct addrinfo *result;
int error, sock_id;
string numericalIPS[100];
//set all bits in hints to zero
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if((error = getaddrinfo(urlString.c_str(), portNum.c_str(), &hints, &results)) != 0) {
cout << "error " << error << ":" << gai_strerror(error) << endl;
}
int i = 0;
//loop through results
for(result = results; result != nullptr; result = result->ai_next) {
struct sockaddr_in *ip_addr;
ip_addr = (struct sockaddr_in *)result->ai_addr;
numericalIPS[i] = inet_ntoa(ip_addr->sin_addr);
i++;
}
return numericalIPS[0];
}
string getFileFromURL(string url) {
int idxToFile;
string file_request;
string path_to_file = getDomainWithFile(url);
for (int i = 0; i < path_to_file.length(); i++) {
if(path_to_file[i] == '/') {
idxToFile = i + 1;
}
}
file_request = path_to_file.substr(idxToFile, path_to_file.length() - idxToFile);
return file_request;
}
string getFilePath(string url) {
string domainPathAndFile = getDomainWithFile(url);
string pathAndFile;
int idxToPath;
for (int i = 0; i < domainPathAndFile.length(); i++) {
if(domainPathAndFile[i] == '/') {
idxToPath = i + 1;
break;
}
}
pathAndFile = domainPathAndFile.substr(idxToPath, domainPathAndFile.length() - idxToPath);
return pathAndFile;
}
I used this website as a testing medium: http://www.testingmcafeesites.com/testcat_ac.html
When I do the following:
wget http://www.testingmcafeesites.com/testcat_ac.html
I get the following:
<html>
<head>
<title>URL for testing - Category Art/Culure</title>
</head>
<body>
<code>
http://www.testingmcafeesites.com/testcat_ac.html<br>
<br>
This is an example URL which should be categorized as an art/culture website with a minimal risk reputation score.<br>
This page simply displays this text without any specific content on it, it is just for testing purpose.<br>
<br>
<b>If you can see this text, it was not blocked by any filter!</b><br>
<br>
</code>
</body>
</html>
However, with my program, I get the following and there are spaces as well in the first couple of lines:
<html>
<head>
<title>URL for testing - Category Art/Culure</title>
</head>
<body>
<code>
http://www.testingmcafeesites.com/testcat_ac.html<br>
<br>
This is an example URL which should be categorized as an art/culture website with a minimal risk reputation score.<br>
This page simply displays this text without any specific content on it, it is just for testing purpose.<br>
<br>
<b>If you can see this text, it was not blocked by any filter!</b><br>
<br>
</code>
</body>
Sadly, this platform does not illustrate the spaces. What can I do to improve the body of the http response in my code. I was able to remove the header of the response with strstr but I feel as if I am missing something else. Thanks!
Upvotes: 0
Views: 229
Reputation: 38991
I suggest that you use
wget http://www.testingmcafeesites.com/testcat_ac.html -O testcat_ac.html
Open the file testcat_ac.html
in a hex editor and you see it contains a lot of unprintable characters, null bytes. Dealing with such data as a C string http_response.c_str()
is not productive. Use std::string
equivalent functions.
Upvotes: 1