Kevin Yan
Kevin Yan

Reputation: 17

Get HTML of url link with Visual Studio

The code below works for the main site, but it does not work for the sub-links. For example, the code works when the url link is "www.reddit.com" or "www.wikipedia.org" but it will not work when it is "www.reddit.com/r/nba" or "www.wikipedia.org/wiki/2007_NASCAR_Nextel_Cup_Series" Do you know how to make it work to include sub-links as well?

 #include "stdafx.h"
#include <winsock2.h>
#include <WS2tcpip.h>
#include <windows.h>
#include <iostream>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
int main(){
    // Initialize Dependencies to the Windows Socket.
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        cout << "WSAStartup failed.\n";
        system("pause");
        return -1;
    }

    // We first prepare some "hints" for the "getaddrinfo" function
    // to tell it, that we are looking for a IPv4 TCP Connection.
    struct addrinfo hints;
    ZeroMemory(&hints, sizeof(hints));
    hints.ai_family = AF_INET;          // We are targeting IPv4
    hints.ai_protocol = IPPROTO_TCP;    // We are targeting TCP
    hints.ai_socktype = SOCK_STREAM;    // We are targeting TCP so its SOCK_STREAM

    // Aquiring of the IPv4 address of a host using the newer
    // "getaddrinfo" function which outdated "gethostbyname".
    // It will search for IPv4 addresses using the TCP-Protocol.
    struct addrinfo* targetAdressInfo = NULL;
    DWORD getAddrRes = getaddrinfo("www.wikipedia.org/wiki/2007_NASCAR_Nextel_Cup_Series", NULL, &hints, &targetAdressInfo);
    if (getAddrRes != 0 || targetAdressInfo == NULL)
    {
        cout << "Could not resolve the Host Name" << endl;
        system("pause");
        WSACleanup();
        return -1;
    }

    // Create the Socket Address Informations, using IPv4
    // We dont have to take care of sin_zero, it is only used to extend the length of SOCKADDR_IN to the size of SOCKADDR
    SOCKADDR_IN sockAddr;
    sockAddr.sin_addr = ((struct sockaddr_in*) targetAdressInfo->ai_addr)->sin_addr;    // The IPv4 Address from the Address Resolution Result
    sockAddr.sin_family = AF_INET;  // IPv4
    sockAddr.sin_port = htons(80);  // HTTP Port: 80

    // We have to free the Address-Information from getaddrinfo again
    freeaddrinfo(targetAdressInfo);

    // Creation of a socket for the communication with the Web Server,
    // using IPv4 and the TCP-Protocol
    SOCKET webSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (webSocket == INVALID_SOCKET)
    {
        cout << "Creation of the Socket Failed" << endl;
        system("pause");
        WSACleanup();
        return -1;
    }

    // Establishing a connection to the web Socket
    cout << "Connecting...\n";
    if (connect(webSocket, (SOCKADDR*)&sockAddr, sizeof(sockAddr)) != 0)
    {
        cout << "Could not connect";
        system("pause");
        closesocket(webSocket);
        WSACleanup();
        return -1;
    }
    cout << "Connected.\n";

    // Sending a HTTP-GET-Request to the Web Server
    const char* httpRequest = "GET / HTTP/1.1\r\nHost: www.google.com\r\nConnection: close\r\n\r\n";
    int sentBytes = send(webSocket, httpRequest, strlen(httpRequest), 0);
    if (sentBytes < strlen(httpRequest) || sentBytes == SOCKET_ERROR)
    {
        cout << "Could not send the request to the Server" << endl;
        system("pause");
        closesocket(webSocket);
        WSACleanup();
        return -1;
    }

    // Receiving and Displaying an answer from the Web Server
    char buffer[10000];
    ZeroMemory(buffer, sizeof(buffer));
    int dataLen;
    while ((dataLen = recv(webSocket, buffer, sizeof(buffer), 0) > 0))
    {
        int i = 0;
        while (buffer[i] >= 32 || buffer[i] == '\n' || buffer[i] == '\r') {
            cout << buffer[i];
            i += 1;
        }
    }

    // Cleaning up Windows Socket Dependencies
    closesocket(webSocket);
    WSACleanup();

    system("pause");
    return 0;
}

Upvotes: 1

Views: 427

Answers (2)

Thomas Sparber
Thomas Sparber

Reputation: 2917

A URL is built by protocol, host and link. e.g. protocol://host/link --> http://www.wikipedia.org/wiki/2007_NASCAR_Nextel_Cup_Series. Here, http is the protocol, www.wikipedia.org is the host and /wiki/2007_NASCAR_Nextel_Cup_Series is the link.

Web Browsers internally split those parts up and then establish a Connection. e.g

  • http --> use port 80
  • www.wikipedia.org --> search for host and establish Connection
  • link (Here Comes the interesting part) --> Send the link through the TCP Connection as header

With a raw TCP Connection you can't directly open URL liks, you Need to send the link through the TCP Connection as Header. Or you use the Standart WebClients which do it for you.

I think the ultimate Problem is that you are trying to establish a TCP Connection with the entire URL instead of just the host.

Upvotes: 0

Oncaphillis
Oncaphillis

Reputation: 1908

You have to replace your GET / with GET /path/you/want/to/access.html and leave the server name as www.wikipedia.org but for heavens sake use WinHttpClient or Curl or something else more hi-level.

Upvotes: 1

Related Questions