Reputation: 105
I am learning network programming in C and tried to create to a toy version of wget.
However, when I run the program, I get the page with some trailing characters at the beginning and at the end (0 & f43 in this case).
The program contains two .c and two .h files.
One for parsing (naïvely) the address and the other to make the network request and dump the data.
Here are the files for parsing the input:
url.h
#ifndef URL_H
#define URL_H
/* information of an URL*/
struct url_info
{
char* url; //full url
char* protocol; // protocol type: http, ftp, etc...
char* host; // host name
int port; //port number
char* path; //path
};
typedef struct url_info url_info;
static const char P_HTTP[] = "http";
void parse_url(char* url, url_info *info);
void exit_with_error(char* message);
void print_url_info(url_info info);
#endif //URL_H
url.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"url.h"
void parse_url(char* url, url_info *info)
{
// url format: [http://]<hostname>[:<port>]/<path>
char *full_url = malloc((strlen(url) + 1) * sizeof(char));
char *protocol;
char *path;
char *host;
int port;
strcpy(full_url, url);
info->url = full_url;
char *protocol_token = strstr(url, "://");
if (protocol_token){
protocol = url;
*protocol_token = '\0';
url = protocol_token + 3;
} else {
protocol = "http";
}
info->protocol = protocol;
char *port_token = strstr(url, ":");
char *path_token = strstr(url, "/");
if (port_token && port_token < path_token){
port = atoi(port_token + 1);
*port_token = '\0';
} else {
port = 80;
}
info->port = port;
if (path_token){
*path_token = '\0';
host = url;
path = path_token + 1;
info->host = host;
info->path = path;
} else {
exit_with_error("No trailing /.");
}
}
void print_url_info(url_info info){
printf("The URL contains following information: \n");
printf("Full url:\t%s\n", info.url);
printf("Protocol type:\t%s\n", info.protocol);
printf("Host name:\t%s\n", info.host);
printf("Port No.:\t%d\n", info.port);
printf("Path:\t\t%s\n", info.path);
}
void exit_with_error(char *message)
{
fprintf(stderr, "%s\n", message);
exit(EXIT_FAILURE);
}
Here are the files for making the request
wgetX.h
#ifndef WGETX_H_
#define WGETX_H_
#define B_SIZE 1024 * 5000
void write_data(const char *path, const char *data);
char* download_page(url_info info, char *buff);
char* http_get_request(char* path, char* host);
char* read_http_reply(char* recv_buf_t);
unsigned long ipfromhost(const char *host);
#endif
wgetX.c
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "url.h"
#include "wgetX.h"
int main(int argc, char* argv[])
{
url_info info;
if (argc != 2) {
exit_with_error("The wgetX must have exactly 1 parameter as input. \n");
}
char *url = argv[1];
parse_url(url, &info);
char *buf;
buf = malloc(sizeof(char)*B_SIZE);
bzero(buf, B_SIZE);
download_page(info, buf);
printf("%s", buf);
free(buf);
return (EXIT_SUCCESS);
}
char* download_page(url_info info, char *buf)
{
struct sockaddr_in dest;
int len, sz, mysocket;
char *request = http_get_request(info.path, info.host);
mysocket = socket(AF_INET, SOCK_STREAM, 0);
memset(&dest, 0, sizeof(dest));
dest.sin_family = AF_INET;
dest.sin_addr.s_addr = ipfromhost(info.host);
dest.sin_port = htons(info.port);
connect(mysocket, (struct sockaddr *)&dest, sizeof(struct sockaddr));
send(mysocket, request, strlen(request), 0);
len = 0;
sz = 0;
do {
len = recv(mysocket, buf + sz, B_SIZE - sz, 0);
if (len == -1) {continue;}
sz += len;
} while (len > 0);
*(buf + sz) = '\0';
close(mysocket);
return buf;
}
char* http_get_request(char* path, char* host) {
char * request_buffer = (char *) malloc(1024);
memset(request_buffer, 0, sizeof(*request_buffer));
snprintf(request_buffer, 1024, "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",
path, host);
return request_buffer;
}
unsigned long ipfromhost(const char *host){
struct in_addr **addr_list;
struct hostent *he;
if ((he = gethostbyname(host)) != NULL){
addr_list = (struct in_addr **) he->h_addr_list;
int i;
for (i = 0; addr_list[i] != NULL; i++){
return addr_list[i]->s_addr;
}
exit_with_error("Couldn't resolve host to ip adress\n");
return 0;
} else {
exit_with_error("Couldn't resolve host to ip adress\n");
return 0;
}
}
Makefile
LINK_TARGET = wgetX
OBJS = \
wgetX.o \
url.o
REBUILDABLES = $(OBJS) $(LINK_TARGET)
all : $(LINK_TARGET)
clean:
rm -f $(REBUILDABLES)
$(LINK_TARGET) : $(OBJS)
cc -g -o $@ $^
%.o : %.c
cc -g -Wall -o $@ -c $<
wgetX.o : wgetX.h url.h
url.o : url.h
When executing the program on one specific url, I get an html output that is different from the source code (as found in Chrome). I get garbage characters: a zero at the end and "f43" just before the start of the html
Commands
make clean
make
./wgetX http://www.google.com/
Output
I got the http reply message with the status code and all and just before "
Upvotes: 0
Views: 177
Reputation: 123320
I get garbage characters: a zero at the end and "f43" just before the start of the html
Welcome to the wonderful world of HTTP. Please note that HTTP is not a trivial protocol even though it might look like this. It should say something that the HTTP/1.1 standard as initially published in RFC 2616 has 176 pages text.
What you likely see here is chunked transfer encoding of the content. In this encoding the content is not transferred as one single piece but in several chunks, each prefixed by the length (in hex). I.e. something like this:
HTTP/1.1 200 ok
Transfer-Encoding: chunked
a
0123456789
12
These are 18 bytes
0
In your specific case the initial f43 "just before the start of the html" is the length of the following chunk (f43 in hex of 3907 in decimal) and the "zero at the end" is the length of the final chunk (0).
For more on this see section 3.6.1 in RFC 2616 or section 4.1 in RFC 7230.
Upvotes: 1