aripy887
aripy887

Reputation: 105

http socket request returning trailling characters

I am learning network programming in C and tried to create to a toy version of wget.

However, when I run the program, I get the page with some trailing characters at the beginning and at the end (0 & f43 in this case).

The program contains two .c and two .h files.

One for parsing (naïvely) the address and the other to make the network request and dump the data.

Here are the files for parsing the input:

url.h

#ifndef URL_H
#define URL_H

/* information of an URL*/
struct url_info
{
    char* url; //full url
    char* protocol; // protocol type: http, ftp, etc...
    char* host; // host name
    int port;   //port number
    char* path; //path
};
typedef struct url_info url_info;

static const char P_HTTP[] = "http";

void parse_url(char* url, url_info *info);

void exit_with_error(char* message);

void print_url_info(url_info info);

#endif //URL_H

url.c

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"url.h"


void parse_url(char* url, url_info *info)
{
    // url format: [http://]<hostname>[:<port>]/<path>

    char *full_url = malloc((strlen(url) + 1) * sizeof(char));
  char *protocol;
  char *path;
  char *host;
  int port;

  strcpy(full_url, url);
    info->url = full_url;

  char *protocol_token = strstr(url, "://");
  if (protocol_token){
    protocol = url;
    *protocol_token = '\0';
    url = protocol_token + 3;
  } else {
    protocol = "http";
  }
    info->protocol = protocol;

  char *port_token = strstr(url, ":");
  char *path_token = strstr(url, "/");

  if (port_token && port_token < path_token){
        port = atoi(port_token + 1);
        *port_token = '\0';
  } else {
    port = 80;
  }
    info->port = port;

  if (path_token){
    *path_token = '\0';
    host = url;
    path = path_token + 1;
        info->host = host;
        info->path = path;
  } else {
    exit_with_error("No trailing /.");
  }
}


void print_url_info(url_info info){
    printf("The URL contains following information: \n");
    printf("Full url:\t%s\n", info.url);
    printf("Protocol type:\t%s\n", info.protocol);
    printf("Host name:\t%s\n", info.host);
    printf("Port No.:\t%d\n", info.port);
    printf("Path:\t\t%s\n", info.path);
}


void exit_with_error(char *message)
{
    fprintf(stderr, "%s\n", message);
    exit(EXIT_FAILURE);
}

Here are the files for making the request

wgetX.h

#ifndef WGETX_H_
#define WGETX_H_

#define B_SIZE 1024 * 5000

void write_data(const char *path, const char *data);

char* download_page(url_info info, char *buff);

char* http_get_request(char* path, char* host);

char* read_http_reply(char* recv_buf_t);

unsigned long ipfromhost(const char *host);

#endif

wgetX.c

 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <unistd.h>
 #include <netdb.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <string.h>

#include "url.h"
#include "wgetX.h"

int main(int argc, char* argv[])
{

    url_info info;


    if (argc != 2) {
        exit_with_error("The wgetX must have exactly 1 parameter as input. \n");
    }
    char *url = argv[1];
    parse_url(url, &info);

    char *buf;
    buf = malloc(sizeof(char)*B_SIZE);
    bzero(buf, B_SIZE);

    download_page(info, buf);
  printf("%s", buf);

    free(buf);
    return (EXIT_SUCCESS);
}

char* download_page(url_info info, char *buf)
{
    struct sockaddr_in dest;
    int len, sz, mysocket;
    char *request = http_get_request(info.path, info.host);

    mysocket = socket(AF_INET, SOCK_STREAM, 0);
    memset(&dest, 0, sizeof(dest));
    dest.sin_family = AF_INET;
    dest.sin_addr.s_addr = ipfromhost(info.host);
    dest.sin_port = htons(info.port);
    connect(mysocket, (struct sockaddr *)&dest, sizeof(struct sockaddr));
    send(mysocket, request, strlen(request), 0);

    len = 0;
    sz = 0;
    do {
        len = recv(mysocket, buf + sz, B_SIZE - sz, 0);
        if (len == -1) {continue;}
        sz += len;
    } while (len > 0);

    *(buf + sz) = '\0';

  close(mysocket);
    return buf;
}


char* http_get_request(char* path, char* host) {
    char * request_buffer = (char *) malloc(1024);
    memset(request_buffer, 0, sizeof(*request_buffer));
    snprintf(request_buffer, 1024, "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n",
            path, host);
    return request_buffer;
}

unsigned long ipfromhost(const char *host){
  struct in_addr **addr_list;
  struct hostent *he;
  if ((he = gethostbyname(host)) != NULL){
    addr_list = (struct in_addr **) he->h_addr_list;
    int i;
    for (i = 0; addr_list[i] != NULL; i++){
      return addr_list[i]->s_addr;
    }
    exit_with_error("Couldn't resolve host to ip adress\n");
    return 0;
  } else {
    exit_with_error("Couldn't resolve host to ip adress\n");
    return 0;
  }
}

Makefile

LINK_TARGET = wgetX
OBJS = \
    wgetX.o \
    url.o
REBUILDABLES = $(OBJS) $(LINK_TARGET)
all : $(LINK_TARGET)
clean:
    rm -f $(REBUILDABLES)

$(LINK_TARGET) : $(OBJS)
    cc -g -o $@ $^

%.o : %.c
    cc -g  -Wall -o $@ -c $<

wgetX.o : wgetX.h url.h
url.o : url.h

When executing the program on one specific url, I get an html output that is different from the source code (as found in Chrome). I get garbage characters: a zero at the end and "f43" just before the start of the html

Commands

make clean
make
./wgetX http://www.google.com/

Output

I got the http reply message with the status code and all and just before "

Upvotes: 0

Views: 177

Answers (1)

Steffen Ullrich
Steffen Ullrich

Reputation: 123320

I get garbage characters: a zero at the end and "f43" just before the start of the html

Welcome to the wonderful world of HTTP. Please note that HTTP is not a trivial protocol even though it might look like this. It should say something that the HTTP/1.1 standard as initially published in RFC 2616 has 176 pages text.

What you likely see here is chunked transfer encoding of the content. In this encoding the content is not transferred as one single piece but in several chunks, each prefixed by the length (in hex). I.e. something like this:

 HTTP/1.1 200 ok
 Transfer-Encoding: chunked

 a
 0123456789
 12
 These are 18 bytes
 0

In your specific case the initial f43 "just before the start of the html" is the length of the following chunk (f43 in hex of 3907 in decimal) and the "zero at the end" is the length of the final chunk (0).

For more on this see section 3.6.1 in RFC 2616 or section 4.1 in RFC 7230.

Upvotes: 1

Related Questions