vax
vax

Reputation: 83

Extract email address from a website for each link inside DOM of page

I want to develop an app to which I give the URL of a specific website, and it extracts all links from that web page. For each extracted link I want to get the HTML content. I am basing this on the concept of deep crawling. My purpose is to get all email addresses of the website. Below is my source code:

 /// <summary>
 /// Finds every email-like token in <paramref name="data"/> and returns them
 /// as one string in which each address is followed by a comma.
 /// </summary>
 /// <param name="data">Text (e.g. downloaded HTML) to scan.</param>
 /// <returns>The matches concatenated as "a@x.com,b@y.org,"; empty when nothing matches.</returns>
 static string ExtractEmails(string data)
 {
     // local-part @ domain . tld, allowing '-', '+', '.' separators inside the parts
     const string pattern = @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
     var matcher = new Regex(pattern, RegexOptions.IgnoreCase);
     MatchCollection hits = matcher.Matches(data);

     // Append each hit with a trailing comma, in the order the regex found them.
     string joined = "";
     for (int i = 0; i < hits.Count; i++)
     {
         joined = joined + hits[i].Value + ",";
     }
     return joined;
 }

     // Every link recorded by Foo, in the order it was processed.
     static readonly List<ParsResult> _results = new List<ParsResult>();
        // Foo returns immediately once depth reaches this value.
        static Int32 _maxDepth = 4;
        // Downloads the page at urlToCheck (or parent.Url when urlToCheck is null), selects its
        // <a> nodes, and for an anchor with an href downloads the target, extracts its emails,
        // prints them, records a ParsResult and recurses with depth + 1.
        // Returns the email string produced for the processed link ("" if none was processed).
        // NOTE(review): as written only the FIRST anchor that has an href is ever processed --
        // see the notes on the individual statements below.
        static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
        {
            string email = "";
            if (depth >= _maxDepth) return email;
            String html;
            using (var wc = new WebClient())
                // Throws NullReferenceException if both urlToCheck and parent are null.
                html = wc.DownloadString(urlToCheck ?? parent.Url);

            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            var aNods = doc.DocumentNode.SelectNodes("//a");
            if (aNods == null || !aNods.Any()) return email;
            foreach (var aNode in aNods)
            {
                var url = aNode.Attributes["href"];
                // Anchors without an href attribute are skipped.
                if (url == null)
                    continue;

                // NOTE(review): this WebClient is never disposed, and any download failure
                // propagates out of Foo because nothing here catches it.
                var wc2 = new WebClient();
                // NOTE(review): the raw href is passed as-is; presumably relative links will
                // not resolve to the crawled site -- confirm.
                String html2 = wc2.DownloadString(url.Value);
                // NOTE(review): plain assignment replaces, rather than appends to, the result.
                email = ExtractEmails(html2);
                Console.WriteLine(email);
                var result = new ParsResult
                {
                    Depth = depth,
                    Parent = parent,
                    Url = url.Value
                };
                _results.Add(result);
                Console.WriteLine("{0} - {1}", depth, result.Url);
                // NOTE(review): the value returned by the recursive call is discarded.
                Foo(depth: depth + 1, parent: result);
                // NOTE(review): returning inside the loop ends it after the first processed link.
                return email;
            }
            return email;
        }

// Entry point: crawls from the site's home page at depth 0 and prints what Foo returns.
static void Main(string[] args)
{
    const string startUrl = "http://www.mobileridoda.com";
    String collected = Foo(startUrl, 0);
    Console.WriteLine("emails " + collected);
}

I want to display in the console all emails extracted from all pages of all links that are inside the DOM of the main page, but it displays no emails in the console. How can I solve this issue? Thank you

Upvotes: 2

Views: 1018

Answers (1)

Jawad
Jawad

Reputation: 11364

Found a few things wrong but no worries, got the details on why and what to do to fix them.

  1. In your foreach loop, when you go through the first URL, you are using a return statement at the end essentially breaking the loop and terminating. Use return only after you have processed ALL the URLs and accumulated the email addresses.

  2. You are overwriting the email string (I see it as a CSV) on every pass through the loop. Use += to keep adding to it. The offending line is: email = ExtractEmails(html2);

  3. You are discarding the value returned when you call Foo within your foreach loop. You need to use email += Foo(xyz). The offending line is: Foo(depth: depth + 1, parent: result);

  4. You are going through a URL that you have already processed... possibly causing an infinite cycle. I added a list of strings that keeps track of URLs you have already visited so as to prevent the infinite loop you might get into.

Here is a complete working solution.

    // Built once and reused: constructing a Regex on every call re-parses the pattern each time.
    static readonly Regex EmailRegex = new Regex(
        @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Finds every email-like token in <paramref name="data"/> and returns them
    /// as one string in which each address is followed by a comma.
    /// </summary>
    /// <param name="data">Text (e.g. downloaded HTML) to scan; may be null or empty.</param>
    /// <returns>
    /// The matches concatenated as "a@x.com,b@y.org," (note the trailing comma, which lets
    /// callers append results from several pages); empty when there is nothing to report.
    /// </returns>
    static string ExtractEmails(string data)
    {
        // Regex.Matches throws on null input; a page with no content simply has no emails.
        if (string.IsNullOrEmpty(data))
            return string.Empty;

        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        var joined = new System.Text.StringBuilder();
        foreach (Match emailMatch in EmailRegex.Matches(data))
        {
            joined.Append(emailMatch.Value).Append(',');
        }
        return joined.ToString();
    }

    // Every link that was successfully downloaded, in the order it was processed.
    static readonly List<ParsResult> _results = new List<ParsResult>();
    // Links found on pages at this depth or deeper are not followed.
    static Int32 _maxDepth = 4;
    // Every URL already fetched (or attempted), so that no page is processed twice and
    // link cycles terminate. A set gives constant-time membership tests.
    static readonly HashSet<string> urlsAlreadyVisited = new HashSet<string>();

    /// <summary>
    /// Crawls outward from a start page and collects the email addresses found on the
    /// pages it links to, following links until <c>_maxDepth</c> is reached.
    /// </summary>
    /// <param name="urlToCheck">Page to start from; when null, <paramref name="parent"/>'s Url is used.</param>
    /// <param name="depth">Depth assigned to the links found on the start page.</param>
    /// <param name="parent">Result recorded as the Parent of the links found on the start page.</param>
    /// <returns>
    /// The concatenated output of ExtractEmails for every linked page that was downloaded;
    /// empty when the start page was already visited or <paramref name="depth"/> is at the limit.
    /// Emails on the start page itself are not extracted, only those on pages it links to.
    /// </returns>
    /// <exception cref="ArgumentException">Neither argument supplies a URL.</exception>
    static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
    {
        // Resolve the effective URL BEFORE the visited check. Checking urlToCheck directly
        // records null for every call made with only a parent, so all such calls after the
        // first would be rejected as "already visited".
        String pageUrl = urlToCheck ?? (parent != null ? parent.Url : null);
        if (string.IsNullOrEmpty(pageUrl))
            throw new ArgumentException("Either urlToCheck or parent.Url must be supplied.");

        // HashSet.Add returns false when the URL is already present.
        if (!urlsAlreadyVisited.Add(pageUrl) || depth >= _maxDepth)
            return string.Empty;

        String html;
        using (var wc = new WebClient())
            html = wc.DownloadString(pageUrl);

        return CrawlLinks(html, depth, parent);
    }

    // Processes every not-yet-visited absolute link in html: downloads it, collects its
    // emails, records it in _results, and then follows that page's own links while the
    // depth limit allows. Reuses the downloaded HTML so each page is fetched only once.
    static String CrawlLinks(String html, Int32 depth, ParsResult parent)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var aNods = doc.DocumentNode.SelectNodes("//a");
        if (aNods == null) return string.Empty;

        // Distinct absolute links on this page. Anchors without an href are skipped
        // instead of causing a NullReferenceException.
        List<string> allUrls = aNods
            .Select(x => x.Attributes["href"])
            .Where(href => href != null && href.Value != null
                           && href.Value.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            .Select(href => href.Value)
            .Distinct()
            .ToList();

        string email = "";
        foreach (string url in allUrls)
        {
            // Mark before downloading so a failing URL is not retried from every page linking to it.
            if (!urlsAlreadyVisited.Add(url))
                continue;

            string linkedHtml;
            try
            {
                using (var wc2 = new WebClient())
                    linkedHtml = wc2.DownloadString(url);
            }
            catch (Exception ex)
            {
                // Best effort: one dead or malformed link must not abort the whole crawl,
                // but the failure is reported instead of being silently swallowed.
                Console.Error.WriteLine("Skipping {0}: {1}", url, ex.Message);
                continue;
            }

            email += ExtractEmails(linkedHtml);
            Console.WriteLine(email);
            var result = new ParsResult
            {
                Depth = depth,
                Parent = parent,
                Url = url
            };
            _results.Add(result);
            Console.WriteLine("{0} - {1}", depth, result.Url);

            if (depth + 1 < _maxDepth)
                email += CrawlLinks(linkedHtml, depth + 1, result);
        }
        return email;
    }
    /// <summary>One crawled link, together with where in the crawl it was found.</summary>
    public class ParsResult
    {
        /// <summary>Address of the link.</summary>
        public string Url { get; set; }

        /// <summary>Crawl depth at which the link was found.</summary>
        public int Depth { get; set; }

        /// <summary>Result for the page on which this link was found; may be null.</summary>
        public ParsResult Parent { get; set; }
    }

    // ========== MAIN CLASS ==========

    // Entry point: crawls from the site's home page at depth 0 and prints what Foo returns.
    static void Main(string[] args)
    {
        const string startUrl = "http://www.mobileridoda.com";
        String collected = Foo(startUrl, 0);
        Console.WriteLine("emails " + collected);
    }

Upvotes: 1

Related Questions