Reputation: 3956
I am new to the Jsoup library, and this piece of HTML is giving me hell parsing it.
Remote HTML
//Skipped the meta and header because I don't need it.
...
<body class="sin">
<div class="ks">
<div class="wrap">
<div class="content-right-sidebar-wrap">
<main class="content">
//A lot of unneeded tags
<article class="post-1989009 post type-post post" itemscope="" itemtype="http://schema.org/CreativeWork">
<header class="post-header">
<h1 class="post-title" itemprop="headline">Tyh RGB Marco to habits gtr</h1>
<img src="https://ohniee.com/wp-content/uploads/avatars/1/djsy8933e89ufio8389e8-author-img.jpg" class="avatar user-1-avatar avatar-40 photo" width="40" height="40" alt="Profile photo of Johnnie Adams">
<div class="entry-meta" style="padding-top:3px; margin-left: 50px">
" Written by "<a href="/authors/johnnie"><span class="entry-author" itemprop="author" itemscope="" itemtype="http://schema.org/Person"><span class="entry-author-name" itemprop="name">Johnnie Adams</span></span></a> <script>
document.write(" on April 23rd, 2002 11:28 PM")</script>" on April 23rd, 2002 11:28 PM . "<span class="entry-comments-link"><a href="https://johniee.com/2002/04/thalo-in-American-film-industryk.html#comments">1 Comment</a></span>
</div>
</header>
//A lot of unneeded tags
...
This is how I am parsing it:
// Text of the first <div class="entry-meta"> in the document.
// NOTE(review): first() returns null when nothing matches, so this line would throw on a page without that div.
String post_authordate = document.select("div.entry-meta").first().text();
postAuthorDate.setText(post_authordate);
// NOTE(review): [class=avater] compares the WHOLE class attribute against "avater" (itself a misspelling of
// "avatar"), so it cannot match class="avatar user-1-avatar avatar-40 photo" and the selection comes back empty.
Elements img = document.select("img[class=avater]");
// attr() on an empty selection presumably yields "", which would explain why postAuthorUrl shows nothing.
String author_image = img.attr("src");
postAuthorUrl.setText(author_image);
And this is what I'm getting:
postAuthorUrl
nothing
is showing. What I want:
Written by Johnnie Adams on April 23rd, 2002 11:28 PM
https://ohniee.com/wp-content/uploads/avatars/1/djsy8933e89ufio8389e8-author-img.jpg
/**
 * Requests the post identified by the "PostId" intent extra and forwards the response body to
 * {@link #parseHtml(String)}.
 *
 * <p>The progress indicator is shown while the request is in flight and hidden again on either
 * outcome. On failure a non-cancelable dialog offers a retry (which re-enters this method when a
 * network connection is available, otherwise shows {@code internetDialog}) or closes the screen.
 */
private void loadPost() {
    Log.d(TAG, "loadPost called");

    final ProgressBar spinner = (ProgressBar) findViewById(R.id.progress_circle);
    spinner.setVisibility(View.VISIBLE);

    // The extra is passed straight to StringRequest as its first argument.
    String postLocation = getIntent().getStringExtra("PostId");
    Log.d(TAG, "You clicked post id " + postLocation);

    Response.Listener<String> onSuccess = new Response.Listener<String>() {
        @Override
        public void onResponse(String response) {
            if (spinner != null) {
                spinner.setVisibility(View.GONE);
            }
            parseHtml(response);
            // Keep the raw body around after parsing.
            postData = response;
        }
    };

    Response.ErrorListener onFailure = new Response.ErrorListener() {
        @Override
        public void onErrorResponse(VolleyError error) {
            VolleyLog.d("", "Error: " + error.getMessage());
            if (spinner != null) {
                spinner.setVisibility(View.GONE);
            }

            DialogInterface.OnClickListener retry = new DialogInterface.OnClickListener() {
                @Override
                public void onClick(DialogInterface dialog, int which) {
                    if (NetworkCheck.isAvailableAndConnected(PostDetails.this)) {
                        loadPost();
                    } else {
                        internetDialog.show();
                    }
                }
            };
            DialogInterface.OnClickListener giveUp = new DialogInterface.OnClickListener() {
                @Override
                public void onClick(DialogInterface dialog, int which) {
                    finish();
                }
            };

            final AlertDialog.Builder failureDialog = new AlertDialog.Builder(PostDetails.this);
            failureDialog.setCancelable(false);
            failureDialog.setMessage(R.string.sth_wrongme_det);
            failureDialog.setPositiveButton(R.string.alert_retry, retry);
            failureDialog.setNegativeButton(R.string.alert_cancel, giveUp);
            failureDialog.show();
        }
    };

    StringRequest pageRequest = new StringRequest(postLocation, onSuccess, onFailure);

    // A fresh queue is created for every call, then the request is enqueued on it.
    RequestQueue queue = Volley.newRequestQueue(this);
    queue.add(pageRequest);
}
/**
 * Extracts the author/date line and the author avatar URL from the post HTML and shows the
 * author/date line in {@code postAuthorDate}.
 *
 * <p>Lookups go through {@code eq(0)} rather than {@code get(0)}: when the page has no
 * {@code div.entry-meta} or no {@code img.avatar}, the result is an empty string instead of an
 * {@code IndexOutOfBoundsException} thrown from inside the network callback.
 *
 * @param response raw HTML of the post page
 */
private void parseHtml(String response) {
    Log.d(TAG, "parsinghtml");
    Document document = Jsoup.parse(response);

    // eq(0) keeps only the first match (or nothing), so text()/attr() fall back to "" on a miss.
    String post_authordate = document.select("div.entry-meta").eq(0).text();
    String img = document.select("img.avatar").eq(0).attr("src");

    postAuthorDate.setText(post_authordate);

    // NOTE(review): the avatar URL was extracted but never consumed; it is logged here so the value
    // is at least observable. Bind it to the intended view once that view is confirmed.
    Log.d(TAG, "author avatar: " + img);
}
Upvotes: 0
Views: 218
Reputation: 2541
Try this
This is how I read & parse the file with html content
/**
 * Reads an entire file into a string.
 *
 * @param path     location of the file to read
 * @param encoding charset used to decode the file's bytes
 * @return the decoded file content
 * @throws IOException if the file cannot be read
 */
public static String readFile(String path, Charset encoding) throws IOException {
    return new String(Files.readAllBytes(Paths.get(path)), encoding);
}
/**
 * Demo: loads "sample.html", then prints the entry-meta text (cut off right after the first
 * "AM", or failing that the first "PM") followed by the avatar image URL.
 *
 * @throws IOException if "sample.html" cannot be read
 */
public void jsouptest() throws IOException {
    // there is a file named "sample.html" in the project directory
    String html = readFile("sample.html", StandardCharsets.UTF_8);
    Document parsed = Jsoup.parseBodyFragment(html, "");
    Element body = parsed.body();

    String metaText = body.select("div.entry-meta").get(0).text();
    String avatarSrc = body.select("img.avatar").get(0).attr("src");

    // Locate the meridiem marker; "AM" takes precedence, "PM" is only consulted when "AM" is absent.
    int marker = metaText.indexOf("AM");
    if (marker < 0) {
        marker = metaText.indexOf("PM");
    }

    // Keep everything up to and including the two-letter marker; leave the text alone if none was found.
    if (marker >= 0) {
        metaText = metaText.substring(0, marker + 2);
    }

    System.out.println(metaText);
    System.out.println(avatarSrc);
}
Output (Edited for the new sample html):
" Written by "Johnnie Adams " on April 23rd, 2002 11:28 PM
https://ohniee.com/wp-content/uploads/avatars/1/djsy8933e89ufio8389e8-author-img.jpg
The first line of output has everything after "PM" stripped, so the extra full stop and quotation mark are missing at the end.
Upvotes: 1
Reputation: 156
Try img[class~=avatar user-(\d+)-avatar avatar-40 photo]
instead of img[class=avater]
The date in the html source is 2002. You want 2016?
// Option 1: remove the "<digits> Comment" label with a regex. The backslash has to be doubled
// inside a Java string literal, otherwise "\d" is an illegal escape and the line does not compile.
System.out.println(entryMetaText.replaceAll("\\d+ Comment", ""));
// Option 2: drop the last 9 characters.
// NOTE(review): assumes the text ends with a 9-character label such as "1 Comment" — confirm.
System.out.println(entryMetaText.substring(0, entryMetaText.length() - 9));
Upvotes: 1