
Reputation: 1827

Convert Word documents to HTML with embedded images using Tika

I'm new to Tika. I'm trying to convert Microsoft Word documents to HTML with Tika, using the TikaOnDotNet wrapper so that I can call Tika from the .NET Framework. My conversion code is as follows:

        // Load the whole Word document into memory so it can be parsed from a byte stream.
        byte[] file = Files.toByteArray(new File(@"myPath\document.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();

        // Serialize the SAX events emitted by the parser as indented UTF-8 HTML
        // into an in-memory buffer.
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(output));

        // Wrap the serializer in Tika's ExpandedTitleContentHandler before parsing.
        ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

        tikaParser.parse(new ByteArrayInputStream(file), handler1, new Metadata());

        // Flush the buffered HTML to the target file and always close the stream,
        // so the content actually reaches disk and the file handle is released
        // even if the write fails.
        File ofile = new File(@"C:\toHtml\text.html");
        DataOutputStream stream = new DataOutputStream(new FileOutputStream(ofile));
        try
        {
            output.writeTo(stream);
        }
        finally
        {
            stream.close();
        }

Everything works well except for the embedded images. The generated HTML contains image tags like:

<img src="embedded:image2.wmf" alt="image2.wmf"/>

but the image source does not exist. Please advise.

Upvotes: 3

Views: 2842

Answers (1)


Reputation: 1827

Credit goes to @Gagravarr.

Please note that this is a simplified implementation; the original code is available in the comments on the question.

This implementation is based on the TikaOnDotNet wrapper:

// Converts a Word document to HTML with Tika (Java types used from C#), rewriting the
// src of embedded <img> tags and copying embedded images to a folder on disk.
// NOTE(review): this listing contains no braces, and some `else`/`try` lines appear to be
// missing as well (see the notes below); restore them before trying to compile.
public class DocToHtml

    // Default Tika configuration; not referenced by any code visible in this listing.
    private TikaConfig config = TikaConfig.getDefaultConfig();
    // Reads filename.doc, parses it through an HTML-serializing handler chain, and writes
    // the serialized bytes to C:\toHtml\text.html.
    public void Convert()

        byte[] file = Files.toByteArray(new File(@"filename.doc"));
        AutoDetectParser tikaParser = new AutoDetectParser();

        // In-memory buffer that receives the serialized HTML.
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)TransformerFactory.newInstance();
        // This one stream is passed first to the encoding detector and later to the parser.
        // NOTE(review): confirm the detector leaves the stream positioned at the start.
        var inputStream = new ByteArrayInputStream(file);
        //           ToHTMLContentHandler handler = new ToHTMLContentHandler();
        var metaData = new Metadata();
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        // Falls back to a UTF_32 instance when the detector returns null.
        var encode = encodingDetector.detect(inputStream, metaData) ?? new UTF_32();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        // The output encoding is taken from the detected (or fallback) charset's string form.
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encode.toString());
        handler.setResult(new StreamResult(output));

        // Decorate the serializer so <img> attributes are rewritten before being serialized.
        ContentHandler imageRewriting = new ImageRewritingContentHandler(handler); 

        //  ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
        // Register the file-writing extractor in the parse context under the
        // EmbeddedDocumentExtractor type.
        ParseContext context = new ParseContext();
        context.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentEtractor());

        // Note: a fresh Metadata is passed here, not the metaData used for encoding detection.
        tikaParser.parse(inputStream, imageRewriting, new Metadata(), context);

        byte[] array =  output.toByteArray();

       System.IO.File.WriteAllBytes(@"C:\toHtml\text.html", array);


    // Content handler decorator that, for "img" elements, replaces "embedded:" in the src
    // attribute with "images\" and adds a width attribute of "100px".
    private class ImageRewritingContentHandler : ContentHandlerDecorator
        public ImageRewritingContentHandler(ContentHandler handler) : base(handler)

        public override void startElement(string uri, string localName, string name, Attributes origAttrs)
            if ("img".Equals(localName))
                AttributesImpl attrs;
                // NOTE(review): an `else` line appears to be missing between the two
                // assignments below; as listed, the second always overwrites the first.
                if (origAttrs is AttributesImpl)
                    attrs = (AttributesImpl)origAttrs;
                    attrs = new AttributesImpl(origAttrs);

                // Look for the src attribute and rewrite it if it uses the "embedded:" prefix.
                for (int i = 0; i < attrs.getLength(); i++)
                    if ("src".Equals(attrs.getLocalName(i)))
                        String src = attrs.getValue(i);
                        if (src.StartsWith("embedded:"))
                            var newSrc = src.Replace("embedded:", @"images\");

                            attrs.setValue(i, newSrc);
                // NOTE(review): if this is the SAX AttributesImpl, the fourth argument is the
                // attribute type (commonly "CDATA") — confirm that "width" is intended there.
                attrs.addAttribute(null, "width", "width","width", "100px");
                // NOTE(review): presumably the second call below belongs to an `else` branch
                // for non-img elements; as listed, both calls would run for img elements.
                base.startElement(uri, localName, name, attrs);
                base.startElement(uri, localName, name, origAttrs);

    // Embedded-document extractor that copies embedded resources detected as images into
    // the C:\toHtml\images folder.
    private class FileEmbeddedDocumentEtractor : EmbeddedDocumentExtractor
        // Not used by any code visible in this listing.
        private int count = 0;
        // Accepts every embedded document.
        public bool shouldParseEmbedded(Metadata m)
            return true;

        // Detects the media type of the embedded stream and, when its top-level type is
        // "image", copies the stream to a file named after the "resourceName" metadata value.
        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, bool outputHtml)
            Detector detector = new DefaultDetector();
            // NOTE(review): name is not checked for null before being used as the file name.
            string name = metadata.get("resourceName");
            MediaType contentType = detector.detect(inputStream, metadata);
            // Skip anything that is not detected as an image.
            if (contentType.getType() != "image") return;
            var embeddedFile = name;
            File outputFile = new File(@"C:\toHtml\images", embeddedFile);
                // NOTE(review): the indentation suggests a `try` line was dropped before this
                // `using`; the `catch` at the end has no visible matching `try`.
                using (FileOutputStream os = new FileOutputStream(outputFile))
                    var tin = inputStream as TikaInputStream;
                    if (tin != null)
                        if (tin.getOpenContainer() != null && tin.getOpenContainer() is DirectoryEntry)
                            // fs is created but never used in the visible code.
                            // NOTE(review): presumably a container-copy step was omitted here.
                            POIFSFileSystem fs = new POIFSFileSystem();

                            // Copy the embedded stream's bytes into the output file.
                            // NOTE(review): which branch this copy belongs to cannot be
                            // determined from this listing.
                            IOUtils.copy(inputStream, os);
            // NOTE(review): the handler body is not shown — make sure the exception is
            // logged or rethrown rather than silently swallowed.
            catch (Exception ex)


Upvotes: 2

Related Questions