Reputation: 184
I was scraping data from a webpage using HtmlAgilityPack with this code:
// Fragment of a larger method: fetches the tracking page for one container and
// copies values from the matched <td> cells into row `i` of
// emporevmatokibotiaGridView. `i` and the grid/column objects are defined
// outside this fragment.
string Name= "ARKU2215462";
string containerInfo = LoadContent(Name);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(containerInfo);
// NOTE(review): presumably this makes SelectNodes return an empty collection
// instead of null when nothing matches. If so, the `nodes != null` check below
// is always true and the final `else` branch is never reached - confirm against
// the HtmlAgilityPack documentation.
doc.OptionEmptyCollection = true;
// Every <td> whose style attribute is exactly 'padding:7px'.
HtmlNode[] nodes = doc.DocumentNode
.SelectNodes("//td[@style='padding:7px']")
.ToArray();
if (nodes != null)
{
foreach (HtmlNode item in nodes)
{
Console.WriteLine(item.InnerHtml);
// NOTE(review): the body reads nodes[1] and nodes[3], not `item`, so every
// iteration writes the same values; it throws if fewer than four cells matched.
// Second cell is split on '-': first part -> colArithmosAKLA,
// second part -> colArithmosEidous (akla[1] throws if there is no '-').
string[] akla = nodes[1].InnerHtml.ToString().Split('-');
emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, akla[0]);
emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosEidous, akla[1]);
// `date` is computed from the fourth cell but not used in this fragment;
// the year column below is filled with the current year instead.
string[] date = nodes[3].InnerHtml.ToString().Split();
emporevmatokibotiaGridView.SetRowCellValue(i, colEtosAKLA, DateTime.Now.Year);
// Empty first part: overwrite with the placeholder text
// (Greek, roughly "NOT UNLOADED").
if (akla[0].IsNullOrEmpty())
{
emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, "ΔΕΝ ΕΚΦΟΡΤΩΘΗΚΕ");
}
}
// After the loop: if the cell was never filled (e.g. no cells matched),
// write the same placeholder.
if (emporevmatokibotiaGridView.GetRowCellValue(i, colArithmosAKLA).IsNull())
emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, "ΔΕΝ ΕΚΦΟΡΤΩΘΗΚΕ");
}
else
{
// Fallback when the node query produced null: write the placeholder.
emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, "ΔΕΝ ΕΚΦΟΡΤΩΘΗΚΕ");
}
}
/// <summary>
/// Posts a container-code lookup to the old THPA tracking page and returns the
/// response markup after it has been parsed by HtmlAgilityPack.
/// </summary>
/// <param name="reference">Container code to look up, e.g. "ARKU2215462".</param>
/// <returns>The InnerHtml of the parsed response document.</returns>
/// <remarks>
/// Blocks the calling thread on the HTTP call via <c>.Result</c>, as the original
/// did, so failures still surface wrapped in <see cref="AggregateException"/>.
/// </remarks>
private static string LoadContent(string reference)
{
    const string url = "https://portal.thpa.gr/fnet5/track/index.php";

    // Percent-encode the value so characters such as '&', '=', '+' or spaces
    // cannot corrupt the form-urlencoded body. A null reference is sent as an
    // empty value, which is what the original interpolation produced.
    string formBody = $"d=1&containerCode={Uri.EscapeDataString(reference ?? string.Empty)}&go=1";

    // Everything created here is disposable; release it deterministically
    // instead of leaving sockets and buffers to the finalizer.
    using (var hc = new HttpClient())
    using (var payload = new StringContent(formBody, Encoding.UTF8, "application/x-www-form-urlencoded"))
    using (HttpResponseMessage reqUrlContent = hc.PostAsync(url, payload).Result)
    using (Stream stream = reqUrlContent.Content.ReadAsStreamAsync().Result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.Load(stream);
        return doc.DocumentNode.InnerHtml;
    }
}
Now the webpage has changed completely and I cannot retrieve the data as before.
I found a solution thanks to Victor (Scrape data from web page with HtmlAgilityPack c#), but my problem is that, although I now have a string containing the webpage, I cannot retrieve the data I want from it.
The code I am using is
// Address of the new tracking page.
// NOTE(review): `url` is not used in this fragment; Download() hard-codes the same address.
string url = $"https://webportal.thpa.gr/ctreport/container/track";
// Fetch the results page for one container number as a string.
var html = Download("ARKU2215462");
HtmlDocument doc = new HtmlDocument();
// Earlier attempts, kept for reference:
//doc.Load(stream);
//doc.Load(html);
// Parse the downloaded markup from the in-memory string.
doc.LoadHtml(html);
// Prints the InnerHtml of each direct child of the document node only;
// it does not pick out individual table cells.
foreach (HtmlNode item in doc.DocumentNode.ChildNodes)
{
Console.WriteLine(item.InnerHtml);
}
/// <summary>
/// Posts a container-number search to the THPA web portal tracking form and
/// returns the response body as text.
/// </summary>
/// <param name="search">Container number to look up, e.g. "ARKU2215462".</param>
/// <returns>The full response body read to the end.</returns>
public static string Download(string search)
{
    var request = (HttpWebRequest)WebRequest.Create("https://webportal.thpa.gr/ctreport/container/track");

    // The field names are already percent-encoded ("%5B" / "%5D" are '[' / ']').
    // Encode the value too, so characters such as '&', '=', '+' or spaces cannot
    // break the form body. A null search is sent as an empty value, matching
    // what string.Format produced before.
    var postData = $"report_container%5Bcontainerno%5D={Uri.EscapeDataString(search ?? string.Empty)}&report_container%5Bsearch%5D=";
    var data = Encoding.ASCII.GetBytes(postData);

    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;

    using (var requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (var response = (HttpWebResponse)request.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream()))
    {
        return reader.ReadToEnd();
    }
}
The information is inside the InnerHtml, but I cannot extract it into the variables that I want:
> <tbody> <tr> <td>ARKU2215462</td> <td>2022000648-358</td>
> <td>DISCHARGE</td> <td>2022-05-26 04:42:20</td> <td> </td> </tr>
> <tr><td>ARKU2215462</td> <td>2022000648-358</td> <td>COLLECT</td>
> <td>2022-05-27 20:38:23</td> <td></td> </tr> </tbody>
In the old code I used
// Old query: every <td> whose style attribute is exactly 'padding:7px',
// materialized as an array.
HtmlNode[] nodes = doc.DocumentNode
.SelectNodes("//td[@style='padding:7px']")
.ToArray();
and the webpage returned only the last state. Now it returns all states, and I want only the last one. I suspect that HtmlAgilityPack doesn't treat the string as HTML, so I cannot use the DocumentNode to query it.
Upvotes: 0
Views: 508
Reputation: 2371
Yes, the page changed its HTML, but this is easy to solve:
// Cells (<td>) of the LAST table row found inside a <div> whose class
// contains 'card-body'. Despite its name, `rows` holds the cells of that one
// row; it is null when no row or no cell matches.
// SelectNodes can return null when nothing matches (unless the document's
// OptionEmptyCollection is enabled), so every hop is null-conditional:
// otherwise LastOrDefault()/ToArray() would throw on a null source.
var rows = doc.DocumentNode
    .SelectNodes("//div[contains(@class,'card-body')]//tr")
    ?.LastOrDefault()
    ?.SelectNodes(".//td")
    ?.ToArray();
The first SelectNodes selects the unique card-body in the page; that div contains the table. We select all the rows (tr) inside that card.
You want only the last row, so with LastOrDefault() we get that row, and then we select the cells (td) of that row.
If you use the browser's dev tools (F12), you can easily discover which nodes to select with your XPath and quickly adapt your code to the changes.
Upvotes: 1