Extracting url links within a downloaded txt file

Question

Currently working on a url extractor for work. I'm trying to extract all http links/ href links from a downloaded html file and print the links on there own in a separate txt file.So far I've managed to get the entire html of a page downloaded its just extracting the links from it and printing them using Regex is a problem. Wondering if anyone could help me with this?

     private void button2_Click(object sender, EventArgs e)
    {
        Uri fileURI = new Uri(URLbox2.Text);

        WebRequest request = WebRequest.Create(fileURI);
        request.Credentials = CredentialCache.DefaultCredentials;
        WebResponse response = request.GetResponse();
        Console.WriteLine(((HttpWebResponse)response).StatusDescription);
        Stream dataStream = response.GetResponseStream();
        StreamReader reader = new StreamReader(dataStream);
        string responseFromServer = reader.ReadToEnd();

        SW = File.CreateText("C:\Users\Conal_Curran\OneDrive\C#\MyProjects\Web Crawler\URLTester\response1.htm");
        SW.WriteLine(responseFromServer);

        SW.Close();

        string text = System.IO.File.ReadAllText(@"C:\Users\Conal_Curran\OneDrive\C#\MyProjects\Web Crawler\URLTester\response1.htm");
        string[] links = System.IO.File.ReadAllLines(@"C:\Users\Conal_Curran\OneDrive\C#\MyProjects\Web Crawler\URLTester\response1.htm");



        Regex regx = new Regex(links, @"http://([\w+?\.\w+])+([a-zA-Z0-9\~\!\@\#\$\%\^\&\*_\-\=\+\\\/\?\.\:\;\'\,]*)?", RegexOptions.IgnoreCase);

        MatchCollection mactches = regx.Matches(text);

        foreach (Match match in mactches)
        {
            text = text.Replace(match.Value, "" + match.Value + "");
        }

        SW = File.CreateText("C:\Users\Conal_Curran\OneDrive\C#\MyProjects\Web Crawler\URLTester\Links.htm");
        SW.WriteLine(links);
    }

Extracting url links within a downloaded txt file

Answers (1)

Related Questions