Reputation: 83
I want to develop an app where I give it the URL of a specific website and it extracts all links from that web page. For each extracted link I want to fetch the HTML content. My approach is based on the concept of deep crawling: the goal is to collect all email addresses found on the website. Below is my source code:
static string ExtractEmails(string data)
{
    //instantiate with this pattern
    Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
    //find items that matches with our pattern
    MatchCollection emailMatches = emailRegex.Matches(data);
    //StringBuilder sb = new StringBuilder();
    string s = "";
    foreach (Match emailMatch in emailMatches)
    {
        //sb.AppendLine(emailMatch.Value);
        s += emailMatch.Value + ",";
    }
    return s;
}
static readonly List<ParsResult> _results = new List<ParsResult>();
static Int32 _maxDepth = 4;

static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
    string email = "";
    if (depth >= _maxDepth) return email;

    String html;
    using (var wc = new WebClient())
        html = wc.DownloadString(urlToCheck ?? parent.Url);

    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    var aNods = doc.DocumentNode.SelectNodes("//a");
    if (aNods == null || !aNods.Any()) return email;

    foreach (var aNode in aNods)
    {
        var url = aNode.Attributes["href"];
        if (url == null)
            continue;

        var wc2 = new WebClient();
        String html2 = wc2.DownloadString(url.Value);
        email = ExtractEmails(html2);
        Console.WriteLine(email);

        var result = new ParsResult
        {
            Depth = depth,
            Parent = parent,
            Url = url.Value
        };
        _results.Add(result);
        Console.WriteLine("{0} - {1}", depth, result.Url);
        Foo(depth: depth + 1, parent: result);
        return email;
    }
    return email;
}
static void Main(string[] args)
{
    String res = Foo("http://www.mobileridoda.com", 0);
    Console.WriteLine("emails " + res);
}
I want to display in the console all emails extracted from all the pages behind the links inside the DOM of the main page, but it displays no emails in the console. How can I solve this issue? Thank you.
Upvotes: 2
Views: 1018
Reputation: 11364
I found a few things wrong, but no worries: here are the details on why they break and what to do to fix them.
1. In your foreach loop, you hit a return statement at the end of the first iteration, which essentially breaks the loop and terminates after the first URL. Only return after you have processed ALL the URLs and accumulated the email addresses.
2. You are overwriting email (I see it as a CSV) on every pass through the loop with email = ExtractEmails(html2);. Use += so you keep appending instead.
3. You are throwing away the result when you call Foo within your foreach loop: Foo(depth: depth + 1, parent: result); returns the emails it found, so accumulate them with email += Foo(...).
4. You may crawl a URL that you have already processed, possibly causing an infinite cycle. I added a list of strings that keeps track of the URLs you have already visited, to prevent the infinite loop you might otherwise get into (see the aside right after this list).
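One aside on point 4, not required for the fix: Contains on a List&lt;string&gt; is a linear scan, so on a large crawl a HashSet&lt;string&gt; is the more natural container. Its Add method returns false when the value is already present, so the check and the insert collapse into a single call. A minimal sketch, using the same currentUrl variable as the listing below:

static readonly HashSet<string> urlsAlreadyVisited = new HashSet<string>();

// HashSet<T>.Add returns false if the value was already in the set,
// so this one call replaces the Contains-then-Add pair.
if (!urlsAlreadyVisited.Add(currentUrl))
    return string.Empty;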
Here is a complete working solution.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack; // HtmlDocument comes from the HtmlAgilityPack NuGet package

static string ExtractEmails(string data)
{
    //instantiate with this pattern
    Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
    //find items that matches with our pattern
    MatchCollection emailMatches = emailRegex.Matches(data);
    //StringBuilder sb = new StringBuilder();
    string s = "";
    foreach (Match emailMatch in emailMatches)
    {
        //sb.AppendLine(emailMatch.Value);
        s += emailMatch.Value + ",";
    }
    return s;
}
static readonly List<ParsResult> _results = new List<ParsResult>();
static Int32 _maxDepth = 4;
static List<string> urlsAlreadyVisited = new List<string>();

static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
    // Resolve the URL first, so the visited check also works on recursive
    // calls, where urlToCheck is null and the URL comes from parent.Url.
    string currentUrl = urlToCheck ?? parent.Url;
    if (urlsAlreadyVisited.Contains(currentUrl))
        return string.Empty;
    else
        urlsAlreadyVisited.Add(currentUrl);

    string email = "";
    if (depth >= _maxDepth) return email;

    String html;
    using (var wc = new WebClient())
        html = wc.DownloadString(currentUrl);

    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    var aNods = doc.DocumentNode.SelectNodes("//a");
    if (aNods == null || !aNods.Any()) return email;

    // Get distinct absolute URLs from all the <a> nodes on this page,
    // skipping anchors that have no href attribute at all.
    List<string> allUrls = aNods
        .Where(x => x.Attributes["href"] != null)
        .Select(x => x.Attributes["href"].Value)
        .Where(url => url.StartsWith("http"))
        .Distinct()
        .ToList();

    foreach (string url in allUrls)
    {
        var wc2 = new WebClient();
        try
        {
            email += ExtractEmails(wc2.DownloadString(url));
        }
        catch { /* Swallow exception ... URL not found or other errors. */ continue; }
        Console.WriteLine(email);

        var result = new ParsResult
        {
            Depth = depth,
            Parent = parent,
            Url = url
        };
        _results.Add(result);
        Console.WriteLine("{0} - {1}", depth, result.Url);
        email += Foo(depth: depth + 1, parent: result);
    }
    return email;
}
public class ParsResult
{
    public int Depth { get; set; }
    public ParsResult Parent { get; set; }
    public string Url { get; set; }
}
// ========== MAIN CLASS ==========
static void Main(string[] args)
{
    String res = Foo("http://www.mobileridoda.com", 0);
    Console.WriteLine("emails " + res);
}
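One last note: ExtractEmails returns a comma-separated string, and the same address will often appear on several pages, so the accumulated result can contain duplicates. A small post-processing step in Main (a sketch against the code above, nothing more) prints each address only once:

String res = Foo("http://www.mobileridoda.com", 0);
// Split the accumulated CSV, drop empty entries, keep each address once.
var unique = res.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries).Distinct();
Console.WriteLine("emails " + string.Join(",", unique));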
Upvotes: 1