sssbot
sssbot

Reputation: 3

Unable to fetch data using HttpWebRequest or HtmlAgilityPack

I am trying to write a web scraper in C# for NSE. The code works with other sites, but when run against https://www.nseindia.com/ it fails with the error: "An error occurred while sending the request. Unable to read data from the transport connection: Operation timed out."

I have tried two different approaches, Try1() and Try2(). Can anyone please tell me what I am missing in my code?

/// <summary>
/// Two alternative attempts at downloading and parsing the page at
/// <see cref="GetURL"/>: one through HtmlAgilityPack's <c>HtmlWeb</c>,
/// the other through a raw <c>HttpWebRequest</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Attempt 1: load the page with <c>HtmlWeb</c>, printing the HTTP status
    /// code from the post-response callback.
    /// </summary>
    public void Try1() {
        HttpStatusCode statusCode = HttpStatusCode.OK;
        var web = new HtmlWeb { UserAgent = GetUserAgent() };

        web.PostResponse = (request, response) =>
        {
            // Nothing to report when no response object was supplied.
            if (response == null)
            {
                return;
            }

            statusCode = response.StatusCode;
            Console.WriteLine("Status Code: " + statusCode);
        };

        // Reading .Result blocks the calling thread until the load completes.
        HtmlDocument document = web.LoadFromWebAsync(GetURL()).Result;
    }

    /// <summary>
    /// Attempt 2: issue the request with <c>HttpWebRequest</c> and, when the
    /// status is 200 OK, parse the UTF-8 response body into an
    /// <c>HtmlDocument</c>.
    /// </summary>
    public void Try2() {
        var webRequest = (HttpWebRequest)WebRequest.Create(GetURL());
        webRequest.UserAgent = GetUserAgent();
        webRequest.Accept = "*/*;";

        using (var webResponse = (HttpWebResponse)webRequest.GetResponse())
        {
            // Any status other than 200 OK is silently ignored.
            if (webResponse.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (var reader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
            {
                var parsed = new HtmlDocument { OptionFixNestedTags = true };
                parsed.Load(reader);
                Console.WriteLine("Document Loaded.");
            }
        }
    }

    /// <summary>Returns the URL that both attempts request.</summary>
    // Alternative URL previously used here: "https://html-agility-pack.net/"
    private string GetURL() => "https://www.nseindia.com/";

    /// <summary>Returns the User-Agent string sent with both attempts.</summary>
    private string GetUserAgent() =>
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36";
}

Upvotes: 0

Views: 551

Answers (1)

Alfred Luu
Alfred Luu

Reputation: 2028

Your request is missing the Accept header and several others, so the server does not respond. Besides that, I would recommend using HttpClient instead of HttpWebRequest.

// A single shared client: creating a new HttpClient per call leaks sockets.
// The handler negotiates Accept-Encoding and transparently decompresses gzip
// and deflate bodies, so the response never has to be assumed to be gzip.
private static readonly HttpClient s_httpClient = new HttpClient(new HttpClientHandler
{
    AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate
});

/// <summary>
/// Downloads <paramref name="url"/> with browser-like request headers, parses
/// the body into an <c>HtmlDocument</c>, and writes the raw HTML followed by
/// "Document Loaded." to the console.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>A task that completes once the page has been read and parsed.</returns>
/// <exception cref="HttpRequestException">
/// The request failed or the server returned a non-success status code.
/// </exception>
public static async Task GetHtmlData(string url)
{
    using (var request = new HttpRequestMessage(HttpMethod.Get, new Uri(url)))
    {
        request.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml, charset=UTF-8, text/javascript, */*; q=0.01");
        // No manual Accept-Encoding header: the handler adds one that lists
        // only the encodings it is able to decode.
        request.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.137");
        request.Headers.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
        request.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest");

        using (var response = await s_httpClient.SendAsync(request).ConfigureAwait(false))
        {
            response.EnsureSuccessStatusCode();

            // The stream is already decompressed by the handler at this point.
            using (var responseStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false))
            using (var streamReader = new StreamReader(responseStream))
            {
                var result = await streamReader.ReadToEndAsync().ConfigureAwait(false);

                HtmlDocument htmlDoc = new HtmlDocument();
                htmlDoc.OptionFixNestedTags = true;
                htmlDoc.LoadHtml(result);
                Console.WriteLine(result);
                Console.WriteLine("Document Loaded.");
            }
        }
    }
}

Call it like this:

// Because the call is awaited, it must be made from within an async method.
await GetHtmlData("https://www.nseindia.com/");

Upvotes: 2

Related Questions