Reputation: 1481
Download html page from some URL
Download images that were mentioned in html tags.
Change tags for images in my file, so I can open it with my browser offline and see them.
I made first 2 points, but am having difficulties with the third one.Tags do not change.What am I doing wrong?
The job is to open a file, find img src tag and replace it by another tag! Can you give me an example?
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.awt.image.BufferedImage;
import java.net.URL;
import java.net.URLConnection;
import javax.imageio.ImageIO;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTMLDocument;
public class ExtractAllImages {
static String result_doc = "/home/foo/index.html";
static String home_folder = "/home/foo/";
static String start_webURL = "http://www.oracle.com/";
public static void main(String args[]) throws Exception {
String webUrl = start_webURL;
URL url = new URL(webUrl);
URLConnection connection = url.openConnection();
InputStream is = connection.getInputStream();
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
FileWriter writer = new FileWriter(result_doc);
htmlKit.write(writer, htmlDoc, 0, htmlDoc.getLength());
writer.close();
int number_or_images = 0;
String[] array = new String[4096];
for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.IMG); iterator.isValid(); iterator.next()) {
AttributeSet attributes = iterator.getAttributes();
String imgSrc = (String) attributes.getAttribute(HTML.Attribute.SRC);
System.out.println("img_src = " + imgSrc);
if (imgSrc != null && (imgSrc.endsWith(".jpg") || (imgSrc.endsWith(".png")) || (imgSrc.endsWith(".jpeg")) || (imgSrc.endsWith(".bmp")) || (imgSrc.endsWith(".ico")))) {
try {
downloadImage(webUrl, imgSrc);
} catch (IOException ex) {
System.out.println(ex.getMessage());
}
}
array[number_or_images] = imgSrc;
number_or_images++;
///TODO change
}
for(int i =0; i < number_or_images; i++)
{
System.out.println("before = "+array[i]);
while(true)
{
int count = array[i].indexOf('/');
if(count == -1) break;
array[i] = array[i].substring(count+1);
}
System.out.println("after = " + array[i]);
}
//TODO open file and replace tags
int i =0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
System.out.println( input.canWrite());
for( Element img : doc.select("img[src]") )
{
String s = img.attr("src");
System.out.println(s);
img.attr("src", "/home/foo/"+array[i]); // set attribute 'src' to 'your-source-here'
s = img.attr("src");
System.out.println(s);
++i;
}
}
private static void downloadImage(String url, String imgSrc) throws IOException {
BufferedImage image = null;
try {
if (!(imgSrc.startsWith("http"))) {
url = url + imgSrc;
} else {
url = imgSrc;
}
imgSrc = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
String imageFormat = null;
imageFormat = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
String imgPath = null;
imgPath = home_folder + imgSrc + "";
URL imageUrl = new URL(url);
image = ImageIO.read(imageUrl);
if (image != null) {
File file = new File(imgPath);
ImageIO.write(image, imageFormat, file);
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
Upvotes: 2
Views: 7136
Reputation: 1481
Solved. I didn't save changes. Need to add code befire "downloadImage()"
int i = 0;
File input = new File(result_doc);
Document doc = Jsoup.parse(input, "UTF-8");
for( Element img : doc.select("img[src]") ) {
img.attr("src",array[i]); // set attribute 'src' to 'your-source-here'
++i;
}
try {
String strmb = doc.outerHtml();
bw = new BufferedWriter(new FileWriter(result_doc));
bw.write(strmb);
bw.close();
}
catch (Exception ex) {
System.out.println("Program stopped. The problem is " + "\"" +
ex.getMessage()+"\"");
}
Upvotes: 3
Reputation: 1039
You can go with JSOUP Try something like below
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public static void getAllTags(){
try {
File input=new File("H:\\html pages\\index1.html");
Document document=Jsoup.parse(input, "UTF-8");
Document parse=Jsoup.parse(document.html());
Elements body=parse.select("body");
Elements bodyTags=body.select("*");
for (Element element : bodyTags) {
//Do what you want with tag
System.out.println(element.tagName());
}
} catch (Exception e) {
e.printStackTrace();
}
If you want to parse html then try this
public static void parseHTML(){
try {
File input = new File("H:\\html\\index1.html");
Document document = Jsoup.parse(input, "UTF-8");
Document parse = Jsoup.parse(document.html());
Elements bodyElements = parse.select("div");
Elements elements = bodyElements.select("*");
for (Element element : elements) {
FilterHtml.setHtmlTAG(element.tagName());
FilterHtml.ParseXml();
Elements body = bodyElements.select(FilterHtml.getXmlTAG());
if (body.is(FilterHtml.getXmlTAG())) {
Elements tag = parse.select(FilterHtml.getXmlTAG());
//Do something meaning full with tag
System.out.println(tag.text());
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
Hope this would help. if yes please mark it green.
Upvotes: 1