Reputation: 9129
Aim: To download a website source with using a console application. You can find the used class in the program below.
Question: I use the code below to download a data (source) of a web page. Imagine you use chrome; If you enter first this query string, the web page itself redirects you a view HTML page and you see the data.
www.xyz.com/aaa.html?search=aaa&id=1
In an explorer, It works fine . I see 4 HTML tables inside the page when I use google chrome view source option. Bu in my application I see only two tables of the 4 . The two tables inside the web page is missing.(the missing two tables are the second and third.)
How can I overcome to this problem? I want to get the source of the page as I see in chrome.
Bonus informations: There is no iframe.
The particular Code :
string url = "www.xyz.com/aaa.html?search=aaa&id=1";
WebPage pG = ss.RequestPage(url, "", "GET");
pG = ss.RequestPage("www.xyz.com/ViewResult.html");
string source= pG.Html;
public WebPage RequestPage(Uri url, string content, string method, string contentType)
{
string htmlResult;
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
HttpWebResponse response = null;
ASCIIEncoding encoding = new ASCIIEncoding();
byte[] contentData = encoding.GetBytes(content);
request.Proxy = Proxy;
request.Timeout = 60000;
request.Method = method;
request.AllowAutoRedirect = false; // false
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
request.Referer = LastUrl;
request.KeepAlive = true; //false,
request.UserAgent = UserAgent;
request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
//request.Headers.Add("UA-CPU", "x86");
request.Headers.Add("Cache-Control", "no-cache");
request.Headers.Add("Accept-Encoding", "gzip,deflate");
String cookieString = "";
foreach (KeyValuePair<String, String> cookiePair in Cookies)
cookieString += cookiePair.Key + "=" + cookiePair.Value + ";";
if (cookieString.Length > 2)
{
String cookie = cookieString.Substring(0, cookieString.Length - 1);
request.Headers.Add("Cookie", cookie);
}
if (method == "POST")
{
request.ContentLength = contentData.Length;
request.ContentType = contentType;
Stream contentWriter = request.GetRequestStream();
contentWriter.Write(contentData, 0, contentData.Length);
contentWriter.Close();
}
int attempts = 0;
while (true)
{
try
{
response = (HttpWebResponse)request.GetResponse();
if (response == null)
throw new WebException();
break;
}
catch (WebException)
{
if (response != null)
response.Close();
if (attempts == PageReattempts)
{
throw;
}
else { }
// Wait three seconds before trying again
Thread.Sleep(3000);
}
attempts += 1;
}
// Tokenize cookies
if (response.Headers["Set-Cookie"] != null)
{
String headers = response.Headers["Set-Cookie"].Replace("path=/,", ";").Replace("HttpOnly,", "");
foreach (String cookie in headers.Split(';'))
{
if (cookie.Contains("="))
{
String[] splitCookie = cookie.Split('=');
String cookieKey = splitCookie[0].Trim();
String cookieValue = splitCookie[1].Trim();
if (Cookies.ContainsKey(cookieKey))
Cookies[cookieKey] = cookieValue;
else
Cookies.Add(cookieKey, cookieValue);
}
else
{
if (Cookies.ContainsKey(cookie))
Cookies[cookie] = "";
else
Cookies.Add(cookie, "");
}
}
}
htmlResult = ReadResponseStream(response);
response.Close();
if (response.Headers["Location"] != null)
{
response.Close();
Thread.Sleep(1500);
String newLocation = response.Headers["Location"];
WebPage result = RequestPage(newLocation);
return new WebPage(result.Html, new WebPage(htmlResult));
}
LastUrl = url.ToString();
return new WebPage(htmlResult);
}
Upvotes: 3
Views: 2681
Reputation: 1296
1-WebBrowser :
public class ExtendedWebBrowser : System.Windows.Forms.WebBrowser
{
public ExtendedWebBrowser()
{
// Ensure that ScriptErrorsSuppressed is set to false.
this.ScriptErrorsSuppressed = true;
this.ProgressChanged += ExtendedWebBrowser_ProgressChanged;
}
private void ExtendedWebBrowser_ProgressChanged(object sender, WebBrowserProgressChangedEventArgs e)
{
// InjectAlertBlocker();
string alertBlocker = @"window.alert = function () { };
window.print = function () { };
window.open = function () { };
window.onunload = function () { };
window.onbeforeunload = function () { };";
var webBrowser = sender as WebBrowser;
webBrowser?.Document?.InvokeScript("execScript", new Object[] { alertBlocker, "JavaScript" });
this.Document?.InvokeScript("execScript", new Object[] { alertBlocker, "JavaScript" });
}
public void NavigationWaitToComplete(string url)
{
bool complete = false;
NavigationAsync(url).ContinueWith((t) => complete = true);
while (!complete)
{
System.Windows.Forms.Application.DoEvents();
}
}
public void NavigationWaitToComplete(string url, string targetFrameName, byte[] postData, string additionalHeaders)
{
bool complete = false;
NavigationAsync(url, targetFrameName, postData, additionalHeaders).ContinueWith((t) => complete = true);
while (!complete)
{
System.Windows.Forms.Application.DoEvents();
}
}
public async Task NavigationAsync(string url, string targetFrameName, byte[] postData, string additionalHeaders)
{
TaskCompletionSource<bool> tcsNavigation = new TaskCompletionSource<bool>(); ;
TaskCompletionSource<bool> tcsDocument = new TaskCompletionSource<bool>(); ;
Navigated += (s, e) =>
{
if (tcsNavigation.Task.IsCompleted)
return;
tcsNavigation.SetResult(true);
};
DocumentCompleted += (s, e) =>
{
if (ReadyState != WebBrowserReadyState.Complete)
return;
if (tcsDocument.Task.IsCompleted)
return;
tcsDocument.SetResult(true);
};
Navigate(url, targetFrameName, postData, additionalHeaders);
await tcsNavigation.Task;
// navigation completed, but the document may still be loading
await tcsDocument.Task;
// the document has been fully loaded, you can access DOM here
}
public async Task NavigationAsync(string url)
{
TaskCompletionSource<bool> tcsNavigation = new TaskCompletionSource<bool>(); ;
TaskCompletionSource<bool> tcsDocument = new TaskCompletionSource<bool>(); ;
Navigated += (s, e) =>
{
if (tcsNavigation.Task.IsCompleted)
return;
tcsNavigation.SetResult(true);
};
DocumentCompleted += (s, e) =>
{
if (ReadyState != WebBrowserReadyState.Complete)
return;
if (tcsDocument.Task.IsCompleted)
return;
tcsDocument.SetResult(true);
};
Navigate(url);
await tcsNavigation.Task;
// navigation completed, but the document may still be loading
await tcsDocument.Task;
// the document has been fully loaded, you can access DOM here
}
}
Calling:
var browser = new ExtendedWebBrowser();
browser.NavigationWaitToComplete("www.xyz.com/aaa.html?search=aaa&id=1");
var html = browser.Document.Body.OuterHtml();
2-CefSharp.OffScreen
private async Task<string> RequestPageAsync(string url, string cachePath, double zoomLevel)
{
var tcs = new TaskCompletionSource<string>();
var browserSettings = new BrowserSettings();
//Reduce rendering speed to one frame per second so it's easier to take screen shots
browserSettings.WindowlessFrameRate = 1;
var requestContextSettings = new RequestContextSettings { CachePath = cachePath };
// RequestContext can be shared between browser instances and allows for custom settings
// e.g. CachePath
using (var requestContext = new RequestContext(requestContextSettings))
using (var browser = new ChromiumWebBrowser(url, browserSettings, requestContext))
{
if (zoomLevel > 1)
{
browser.FrameLoadStart += (s, argsi) =>
{
var b = (ChromiumWebBrowser)s;
if (argsi.Frame.IsMain)
{
b.SetZoomLevel(zoomLevel);
}
};
}
browser.FrameLoadEnd += (s, argsi) =>
{
var b = (ChromiumWebBrowser)s;
if (argsi.Frame.IsMain)
{
b.GetSourceAsync().ContinueWith(taskHtml =>
{
tcs.TrySetResult(taskHtml.Result);
});
}
};
}
return tcs.Task.Result;
}
Calling :
RequestPageAsync("www.xyz.com/aaa.html?search=aaa&id=1", "cachePath1", 1.0);
Upvotes: 2