S-IT Java
S-IT Java

Reputation: 127

How to export embeded file which from excel using POI?

I have made a java basic program written below, which is making 3 kind of files (ppt,doc,txt) embedded in Excel sheet using Apache POI. Now this file I want to Export with its original format. How to do this?

Reference link is Embed files into Excel using Apache POI. I have made program from this link.

In short I want export functionality on Embedded file.

I have tried above problem using give below code but it not working for exporting embedded file in excel sheet:

Here this is the code which is tried to solve:

public static void main(String[] args) throws IOException {
    String fileName = "ole_ppt_in_xls.xls";
    ReadExcel(fileName);
}

 public static void ReadExcel(String fileName) throws IOException {
    FileInputStream inputFileStream = new FileInputStream(fileName);

    POIFSFileSystem fs = new POIFSFileSystem(inputFileStream);
    HSSFWorkbook workbook = new HSSFWorkbook(fs);

    for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
        // the OLE2 Class Name of the object
        String oleName = obj.getOLE2ClassName();
        System.out.println(oleName);
        if (oleName.equals("Worksheet")) {
            System.out.println("Worksheet");
            DirectoryNode dn = (DirectoryNode) obj.getDirectory();
            HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);

        } else if (oleName.equals("Document")) {
            System.out.println("Document");
            DirectoryNode dn = (DirectoryNode) obj.getDirectory();
            HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
        } else if (oleName.equals("Presentation")) {
            System.out.println("Presentation");
            DirectoryNode dn = (DirectoryNode) obj.getDirectory();
            SlideShow embeddedPowerPointDocument = new SlideShow(
                    new HSLFSlideShow(dn, fs));
        } else if (oleName.equals("Presentation")) {
            System.out.println("Presentation");
            DirectoryNode dn = (DirectoryNode) obj.getDirectory();
            SlideShow embeddedPowerPointDocument = new SlideShow(
                    new HSLFSlideShow(dn, fs));
        }else {
            System.out.println("Else part ");
            if (obj.hasDirectoryEntry()) {
                System.out.println("obj.hasDirectoryEntry()"+obj.hasDirectoryEntry());
                // The DirectoryEntry is a DocumentNode. Examine its entries

                DirectoryNode dn = (DirectoryNode) obj.getDirectory();
                for (Iterator entries = dn.getEntries(); entries.hasNext();) {
                    Entry entry = (Entry) entries.next();
                    System.out.println(oleName + "." + entry.getName());
                }
            } else {
                System.out.println("Else part 22");
                byte[] objectData = obj.getObjectData();
            }
        }
    }

}

Output screen of above program:enter image description here

So ,how to exporting functionality implement?

Upvotes: 3

Views: 3074

Answers (4)

Swetha Prem
Swetha Prem

Reputation: 36

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;

/**
 * Demonstrates how you can extract embedded data from a .xlsx file
 */
public class GetEmbedded {

    public static void main(String[] args) throws Exception {
        String path = "SomeExcelFile.xlsx"
        XSSFWorkbook workbook = new XSSFWorkbook(new FileInputStream(new File(path)));

             for (PackagePart pPart : workbook.getAllEmbedds()) {
                            String contentType = pPart.getContentType();

                            if (contentType.equals("application/vnd.ms-excel")) { //This is to read xls workbook embedded to xlsx file
                                HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());
                                int countOfSheetXls=embeddedWorkbook.getNumberOfSheets();

                 }
                            else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { //This is to read xlsx workbook embedded to xlsx file
                                 if(pPart.getPartName().getName().equals("/xl/embeddings/Microsoft_Excel_Worksheet12.xlsx")){
                                 //"/xl/embeddings/Microsoft_Excel_Worksheet12.xlsx" - Can read an Excel from a particular sheet 
                                // This is the worksheet from the Parent Excel-sheet-12

                                     XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(pPart.getInputStream());
                                     int countOfSheetXlsx=embeddedWorkbook.getNumberOfSheets();
                                     ArrayList<String> sheetNames= new ArrayList<String>();
                                        for(int i=0;i<countOfSheetXlsx;i++){
                                        String name=workbook.getSheetName(i);
                                        sheetNames.add(name);
                                        }
                                }
                            }
                }
     }
}

Upvotes: 2

kiwiwings
kiwiwings

Reputation: 3446

To simplify the processing of embedded data, I've added an extractor class to POI, which will be available in POI 3.16-beta2 or a nightly until then.

The following will extract the objects of .xls/x files - all which is left, is to write the embedded bytes somewhere. It's possible to extend the extractor classes by simply extending EmbeddedExtractor and provide your own iterator() method.

import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.poi.ss.extractor.EmbeddedData;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;

public class BlaExtract {
    public static void main(String[] args) throws Exception {
        InputStream fis = new FileInputStream("bla.xlsx");
        Workbook wb = WorkbookFactory.create(fis);
        fis.close();

        EmbeddedExtractor ee = new EmbeddedExtractor();
        for (Sheet s : wb) {
            for (EmbeddedData ed : ee.extractAll(s)) {
                System.out.println(ed.getFilename()+" ("+ed.getContentType()+") - "+ed.getEmbeddedData().length+" bytes");
            }
        }
        wb.close();
    }
}

Upvotes: 1

S-IT Java
S-IT Java

Reputation: 127

Required jar File List:

  1. commons-codec-1.10.jar
  2. dom4j.jar
  3. poi-3.16-beta1.jar
  4. poi-ooxml-3.8.jar
  5. poi-ooxml-schemas-3.9.jar
  6. poi-scratchpad-3.9.jar
  7. xmlbeans-2.3.0.jar

This is my Whole code implementation:

 import java.awt.Color;
 import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.Closeable;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.lang.reflect.Method;
 import java.net.URL;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import javax.imageio.ImageIO;
 import org.apache.poi.ddf.EscherComplexProperty;
 import org.apache.poi.ddf.EscherOptRecord;
 import org.apache.poi.ddf.EscherProperty;
 import org.apache.poi.hpsf.ClassID;
 import org.apache.poi.hslf.HSLFSlideShow;
 import org.apache.poi.hssf.usermodel.HSSFClientAnchor;
 import org.apache.poi.hssf.usermodel.HSSFObjectData;
 import org.apache.poi.hssf.usermodel.HSSFPatriarch;
 import org.apache.poi.hssf.usermodel.HSSFPicture;
 import org.apache.poi.hssf.usermodel.HSSFPictureData;
 import org.apache.poi.hssf.usermodel.HSSFShape;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFSimpleShape;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.sl.usermodel.AutoShape;
 import org.apache.poi.sl.usermodel.Slide;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.usermodel.WorkbookFactory;
 import org.apache.poi.util.IOUtils;
 import org.apache.poi.xssf.usermodel.XSSFDrawing;
 import org.apache.poi.xssf.usermodel.XSSFPicture;
 import org.apache.poi.xssf.usermodel.XSSFPictureData;
 import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture;


 public class EmbeddedReader {

public static final OleType OLE10_PACKAGE = new OleType("{0003000C-0000-0000-C000-000000000046}");
public static final OleType PPT_SHOW = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final OleType XLS_WORKBOOK = new OleType("{00020841-0000-0000-C000-000000000046}");
public static final OleType TXT_ONLY = new OleType("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
public static final OleType EXCEL97 = new OleType("{00020820-0000-0000-C000-000000000046}");
public static final OleType EXCEL95 = new OleType("{00020810-0000-0000-C000-000000000046}");
public static final OleType WORD97 = new OleType("{00020906-0000-0000-C000-000000000046}");
public static final OleType WORD95 = new OleType("{00020900-0000-0000-C000-000000000046}");
public static final OleType POWERPOINT97 = new OleType("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final OleType POWERPOINT95 = new OleType("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
public static final OleType EQUATION30 = new OleType("{0002CE02-0000-0000-C000-000000000046}");
public static final OleType PdfClassID = new OleType("{B801CA65-A1FC-11D0-85AD-444553540000}");

private File excel_file;
private ImageReader image_reader;

static class OleType {
    final String classId;

    OleType(String classId) {
        this.classId = classId;
    }

    ClassID getClassID() {
        ClassID cls = new ClassID();
        byte clsBytes[] = cls.getBytes();
        String clsStr = classId.replaceAll("[{}-]", "");
        for (int i = 0; i < clsStr.length(); i += 2) {
            clsBytes[i / 2] = (byte) Integer.parseInt(
                    clsStr.substring(i, i + 2), 16);
        }
        return cls;
    }
}

public static void main(String[] args) throws Exception {
    File sample = new File("D:\\ole_ppt_in_xls.xls");
    ImageReader ir = new ImageReader(sample);
    for (EmbeddedData ed : ir.embeddings) {
    FileOutputStream fos = new FileOutputStream(System.getProperty("user.home") + "/Desktop" + "/sumit/"+ ed.filename);
        IOUtils.copy(ed.is, fos);
        fos.close();
    }
    ir.close();
}

static byte[] getSamplePng() throws IOException {
    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    URL imgUrl = cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif");
    BufferedImage img = ImageIO.read(imgUrl);
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    ImageIO.write(img, "PNG", bos);
    return bos.toByteArray();
}


public EmbeddedReader(String excel_path) throws IOException {
    excel_file = new File(excel_path);
    image_reader = new ImageReader(excel_file);
}

public String[] get_file_names() {
    ArrayList<String> file_names = new ArrayList<String>();
    for (EmbeddedData ed : image_reader.embeddings) {
        file_names.add(ed.filename);
    }
    return file_names.toArray(new String[file_names.size()]);
}

public InputStream get_stream(String file_name) {
    InputStream input_stream = null;
    for (EmbeddedData ed : image_reader.embeddings) {
        if (file_name.equals(ed.filename)) {
            input_stream = ed.is;
            break;
        }
    }
    return input_stream;
}

static class ImageReader implements Closeable {
    EmbeddedExtractor extractors[] = { new Ole10Extractor(),new PdfExtractor(), new WordExtractor(), new ExcelExtractor(),new FsExtractor() };

    List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
    Workbook wb;

    public ImageReader(File excelfile) throws IOException {
        try {
            wb = WorkbookFactory.create(excelfile);
            Sheet receiptImages = wb.getSheet("Receipt images");
            if (wb instanceof XSSFWorkbook) {
                addSheetPicsAndEmbedds((XSSFSheet) receiptImages);
            } else {
                addAllEmbedds((HSSFWorkbook) wb);
                addSheetPics((HSSFSheet) receiptImages);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    protected void addSheetPicsAndEmbedds(XSSFSheet sheet)throws IOException {
        if (sheet == null)
            return;
        XSSFDrawing draw = sheet.createDrawingPatriarch();
        for (XSSFShape shape : draw.getShapes()) {
            if (!(shape instanceof XSSFPicture))
                continue;
            XSSFPicture picture = (XSSFPicture) shape;
            XSSFPictureData pd = picture.getPictureData();
            PackagePart pp = pd.getPackagePart();
            CTPicture ctPic = picture.getCTPicture();
            String filename = null;
            try {
                filename = ctPic.getNvPicPr().getCNvPr().getName();
            } catch (Exception e) {
            }
            if (filename == null || "".equals(filename)) {
                filename = new File(pp.getPartName().toString()).getName();
            }
            EmbeddedData ed = new EmbeddedData();
            ed.filename = fileNameWithoutPath(filename);
            ed.is = pp.getInputStream();
            embeddings.add(ed);
        }
    }

    protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException {
        for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) {
            String alternativeName = getAlternativeName(od);
            if (od.hasDirectoryEntry()) {
                DirectoryNode src = (DirectoryNode) od.getDirectory();
                for (EmbeddedExtractor ee : extractors) {
                    if (ee.canExtract(src)) {
                        EmbeddedData ed = ee.extract(src);
                        if (ed.filename == null || ed.filename.startsWith("MBD")|| alternativeName != null) {
                            if (alternativeName != null) {
                                ed.filename = alternativeName;
                            }
                        }
                        ed.filename = fileNameWithoutPath(ed.filename);
                        ed.source = "object";
                        embeddings.add(ed);
                        break;
                    }
                }
            }
        }
    }

    protected String getAlternativeName(HSSFShape shape) {
    EscherOptRecord eor = reflectEscherOptRecord(shape);
        if (eor == null) {
            return null;
        }
        for (EscherProperty ep : eor.getEscherProperties()) {
            if ("groupshape.shapename".equals(ep.getName())
                    && ep.isComplex()) {
                return new String(
                        ((EscherComplexProperty) ep).getComplexData(),
                        Charset.forName("UTF-16LE"));
            }
        }
        return null;
    }

    protected void addSheetPics(HSSFSheet sheet) {
        if (sheet == null)
            return;
        int picIdx = 0;
        int emfIdx = 0;
        HSSFPatriarch patriarch = sheet.getDrawingPatriarch();
        if (patriarch == null)
            return;
        // Loop through the objects
        for (HSSFShape shape : patriarch.getChildren()) {
            if (!(shape instanceof HSSFPicture)) {
                continue;
            }
            HSSFPicture picture = (HSSFPicture) shape;
            if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE)
                continue;
            HSSFPictureData pd = picture.getPictureData();
            byte pictureBytes[] = pd.getData();
            int pictureBytesOffset = 0;
            int pictureBytesLen = pictureBytes.length;
            String filename = picture.getFileName();
            // try to find an alternative name
            if (filename == null || "".equals(filename)) {
                filename = getAlternativeName(picture);
            }
            // default to dummy name
            if (filename == null || "".equals(filename)) {
                filename = "picture" + (picIdx++);
            }
            filename = filename.trim();

            // check for emf+ embedded pdf (poor mans style :( )
            // Mac Excel 2011 embeds pdf files with this method.
            boolean validFile = true;
            if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) {
                validFile = false;
                int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes());
                if (idxStart != -1) {
                    int idxEnd = indexOf(pictureBytes, idxStart,"%%EOF".getBytes());
                    if (idxEnd != -1) {
                        pictureBytesOffset = idxStart;
                        pictureBytesLen = idxEnd - idxStart + 6;
                        validFile = true;
                    }
                } else {
                    // This shape was not a Mac Excel 2011 embedded pdf  file.
                    // So this is a shape related to a regular embedded object
                    // Lets update the object filename with the shapes filename
                    // if the object filename is of format ARGF1234.pdf
                    EmbeddedData ed_obj = embeddings.get(emfIdx);
                    Pattern pattern = Pattern
                            .compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$");
                    Matcher matcher = pattern.matcher(ed_obj.filename);
                    if (matcher.matches()) {
                        ed_obj.filename = filename;
                    }
                    emfIdx += 1;
                }
            }

            EmbeddedData ed = new EmbeddedData();
            ed.filename = fileNameWithoutPath(filename);
            ed.is = new ByteArrayInputStream(pictureBytes,
                    pictureBytesOffset, pictureBytesLen);
            if (fileNotInEmbeddings(ed.filename) && validFile) {
                embeddings.add(ed);
            }
        }
    }

    private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) {
        try {
            Method m = HSSFShape.class.getDeclaredMethod("getOptRecord");
            m.setAccessible(true);
            return (EscherOptRecord) m.invoke(shape);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    private String fileNameWithoutPath(String filename) {
        int last_index = filename.lastIndexOf("\\");
        return filename.substring(last_index + 1);
    }

    private boolean fileNotInEmbeddings(String filename) {
        boolean exists = true;
        for (EmbeddedData ed : embeddings) {
            if (ed.filename.equals(filename)) {
                exists = false;
            }
        }
        return exists;
    }

    public void close() throws IOException {
        Iterator<EmbeddedData> ed = embeddings.iterator();
        while (ed.hasNext()) {
            ed.next().is.close();
        }
        wb.close();
    }
}

static class EmbeddedData {
    String filename;
    InputStream is;
    String source;
}

static abstract class EmbeddedExtractor {
    abstract boolean canExtract(DirectoryNode dn);
    abstract EmbeddedData extract(DirectoryNode dn) throws IOException;
    protected EmbeddedData extractFS(DirectoryNode dn, String filename)
            throws IOException {
        assert (canExtract(dn));
        POIFSFileSystem dest = new POIFSFileSystem();
        copyNodes(dn, dest.getRoot());
        EmbeddedData ed = new EmbeddedData();
        ed.filename = filename;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        dest.writeFilesystem(bos);
         bos.close();
        ed.is = new ByteArrayInputStream(bos.toByteArray());
        return ed;
    }
}

static class Ole10Extractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        return OLE10_PACKAGE.equals(clsId);
    }

    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        try {
            Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
            EmbeddedData ed = new EmbeddedData();
            ed.filename = new File(ole10.getFileName()).getName();
            ed.is = new ByteArrayInputStream(ole10.getDataBuffer());
            return ed;
        } catch (Ole10NativeException e) {
            e.printStackTrace();
            throw new IOException(e);
        }
    }
}

static class PdfExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        return (PdfClassID.equals(clsId) || dn.hasEntry("CONTENTS"));
    }

    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        EmbeddedData ed = new EmbeddedData();
        ed.is = dn.createDocumentInputStream("CONTENTS");
        ed.filename = dn.getName() + ".pdf";
        return ed;
    }
}

static class WordExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        return (WORD95.equals(clsId) || WORD97.equals(clsId) || dn.hasEntry("WordDocument"));
    }

    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        return extractFS(dn, dn.getName() + ".doc");
    }
}

static class ExcelExtractor extends EmbeddedExtractor {
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        return (EXCEL95.equals(clsId) || EXCEL97.equals(clsId) || dn
                .hasEntry("Workbook") /* ... */);
    }

    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        return extractFS(dn, dn.getName() + ".xls");
    }
}

static class FsExtractor extends EmbeddedExtractor {

    public boolean canExtract(DirectoryNode dn) {
        return true;
    }

    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        return extractFS(dn, dn.getName() + ".dat");
    }
}

private static void copyNodes(DirectoryNode src, DirectoryNode dest)
        throws IOException {
    for (Entry e : src) {
        if (e instanceof DirectoryNode) {
            DirectoryNode srcDir = (DirectoryNode) e;
            DirectoryNode destDir = (DirectoryNode) dest
                    .createDirectory(srcDir.getName());
            destDir.setStorageClsid(srcDir.getStorageClsid());
            copyNodes(srcDir, destDir);
        } else {
            InputStream is = src.createDocumentInputStream(e);
            dest.createDocument(e.getName(), is);
            is.close();
        }
    }
}

/**
 * Knuth-Morris-Pratt Algorithm for Pattern Matching Finds the first
 * occurrence of the pattern in the text.
 */
private static int indexOf(byte[] data, int offset, byte[] pattern) {
    int[] failure = computeFailure(pattern);

    int j = 0;
    if (data.length == 0)
        return -1;

    for (int i = offset; i < data.length; i++) {
        while (j > 0 && pattern[j] != data[i]) {
            j = failure[j - 1];
        }
        if (pattern[j] == data[i]) {
            j++;
        }
        if (j == pattern.length) {
            return i - pattern.length + 1;
        }
    }
    return -1;
}

/**
 * Computes the failure function using a boot-strapping process, where the
 * pattern is matched against itself.
 */
private static int[] computeFailure(byte[] pattern) {
    int[] failure = new int[pattern.length];

    int j = 0;
    for (int i = 1; i < pattern.length; i++) {
        while (j > 0 && pattern[j] != pattern[i]) {
            j = failure[j - 1];
        }
        if (pattern[j] == pattern[i]) {
            j++;
        }
        failure[i] = j;
    }

    return failure;
}

}

Upvotes: 1

kiwiwings
kiwiwings

Reputation: 3446

This is partly a duplicate of How to get pictures with names from an xls file using Apache POI, for which I've written the original paste.

As per request, I've added also an example of how to add and embedding with the help of a OLE 1.0 packager - in the meantime I've added the code to POI, so this easier now. For the OOXML based files have a look into this answer.

So the code iterates through all shapes of the DrawingPatriarch and extracts the pictures and embedded files.

I've added the full code - instead of a snippet - to this answer, as I expect the next "why can't I export this kind of embedding" to come up soon ...

package poijartest;

import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.imageio.ImageIO;

import org.apache.poi.ddf.EscherComplexProperty;
import org.apache.poi.ddf.EscherOptRecord;
import org.apache.poi.ddf.EscherProperty;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hssf.usermodel.HSSFClientAnchor;
import org.apache.poi.hssf.usermodel.HSSFObjectData;
import org.apache.poi.hssf.usermodel.HSSFPatriarch;
import org.apache.poi.hssf.usermodel.HSSFPicture;
import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.hssf.usermodel.HSSFShape;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFSimpleShape;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.usermodel.AutoShape;
import org.apache.poi.sl.usermodel.ShapeType;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFPicture;
import org.apache.poi.xssf.usermodel.XSSFPictureData;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture;

/**
 * Tested with POI 3.16-beta1
 * 
 * 17.12.2014: original version for
 *    http://apache-poi.1045710.n5.nabble.com/How-to-get-the-full-file-name-of-a-picture-in-xls-file-td5717205.html
 * 
 * 17.12.2016: added sample/dummy data for
 *    https://stackoverflow.com/questions/41101012/how-to-export-embeded-file-which-from-excel-using-poi 
 */
public class EmbeddedReader {

    private File excel_file;
    private ImageReader image_reader;

    public static void main(String[] args) throws Exception {
        File sample = new File("bla.xls");
        getSampleEmbedded(sample);
        ImageReader ir = new ImageReader(sample);

        for (EmbeddedData ed : ir.embeddings) {
            System.out.println(ed.filename);
            FileOutputStream fos = new FileOutputStream(ed.filename);
            IOUtils.copy(ed.is, fos);
            fos.close();
        }

        ir.close();
    }

    static void getSampleEmbedded(File sample) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook();
        int storageId = wb.addOlePackage(getSamplePPT(), "dummy.ppt", "dummy.ppt", "dummy.ppt");
        int picId = wb.addPicture(getSamplePng(), HSSFPicture.PICTURE_TYPE_PNG);
        HSSFSheet sheet = wb.createSheet();
        HSSFPatriarch pat = sheet.createDrawingPatriarch();
        HSSFClientAnchor anc = pat.createAnchor(0, 0, 0, 0, 1, 1, 3, 6);
        HSSFObjectData od = pat.createObjectData(anc, storageId, picId);
        od.setNoFill(true);
        wb.write(sample);
        wb.close();
    }

    static byte[] getSamplePng() throws IOException {
        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        URL imgUrl = cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif");
        BufferedImage img = ImageIO.read(imgUrl);
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        ImageIO.write(img, "PNG", bos);
        return bos.toByteArray();
    }

    static byte[] getSamplePPT() throws IOException {
        HSLFSlideShow ppt = new HSLFSlideShow();
        Slide<?,?> slide = ppt.createSlide();

        AutoShape<?,?> sh1 = slide.createAutoShape();
        sh1.setShapeType(ShapeType.STAR_32);
        sh1.setAnchor(new java.awt.Rectangle(50, 50, 100, 200));
        sh1.setFillColor(Color.red);

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        ppt.write(bos);
        ppt.close();

        POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(bos.toByteArray()));
        poifs.getRoot().setStorageClsid(ClassID.PPT_SHOW);

        bos.reset();
        poifs.writeFilesystem(bos);
        poifs.close();

        return bos.toByteArray();
    }

    public EmbeddedReader(String excel_path) throws IOException {
        excel_file = new File(excel_path);
        image_reader = new ImageReader(excel_file);
    }

    public String[] get_file_names() {
        ArrayList<String> file_names = new ArrayList<String>();
        for (EmbeddedData ed : image_reader.embeddings) {
            file_names.add(ed.filename);
        }
        return file_names.toArray(new String[file_names.size()]);
    }

    public InputStream get_stream(String file_name) {
        InputStream input_stream = null;
        for (EmbeddedData ed : image_reader.embeddings) {
            if(file_name.equals(ed.filename)) {
                input_stream = ed.is;
                break;
            }
        }
        return input_stream;
    }

    static class ImageReader implements Closeable {
        EmbeddedExtractor extractors[] = {
            new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
        };

        List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
        Workbook wb;

        public ImageReader(File excelfile) throws IOException {
            try {
                wb = WorkbookFactory.create(excelfile);
                Sheet receiptImages = wb.getSheet("Receipt images");
                if (wb instanceof XSSFWorkbook) {
                    addSheetPicsAndEmbedds((XSSFSheet)receiptImages);
                } else {
                    addAllEmbedds((HSSFWorkbook)wb);
                    addSheetPics((HSSFSheet)receiptImages);
                }
            } catch (Exception e) {
                // todo: error handling
            }
        }

        protected void addSheetPicsAndEmbedds(XSSFSheet sheet) throws IOException {
            if (sheet == null) return;
            XSSFDrawing draw = sheet.createDrawingPatriarch();
            for (XSSFShape shape : draw.getShapes()) {
                if (!(shape instanceof XSSFPicture)) continue;
                XSSFPicture picture = (XSSFPicture)shape;
                XSSFPictureData pd = picture.getPictureData();
                PackagePart pp = pd.getPackagePart();
                CTPicture ctPic = picture.getCTPicture();
                String filename = null;
                try {
                    filename = ctPic.getNvPicPr().getCNvPr().getName();
                } catch (Exception e) {}
                if (filename == null || "".equals(filename)) {
                    filename = new File(pp.getPartName().toString()).getName();
                }
                EmbeddedData ed = new EmbeddedData();
                ed.filename = fileNameWithoutPath(filename);
                ed.is = pp.getInputStream();
                embeddings.add(ed);
            }
        }

        protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException {
            for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) {
                String alternativeName = getAlternativeName(od);
                if (od.hasDirectoryEntry()) {
                    DirectoryNode src = (DirectoryNode)od.getDirectory();
                    for (EmbeddedExtractor ee : extractors) {
                        if (ee.canExtract(src)) {
                            EmbeddedData ed = ee.extract(src);
                            if (ed.filename == null || ed.filename.startsWith("MBD") || alternativeName != null) {
                                ed.filename = alternativeName;
                            }
                            ed.filename = fileNameWithoutPath(ed.filename);
                            ed.source = "object";
                            embeddings.add(ed);
                            break;
                        }
                    }
                }
            }
        }

        protected String getAlternativeName(HSSFShape shape) {
            EscherOptRecord eor = reflectEscherOptRecord(shape);
            if (eor == null) return null;
            for (EscherProperty ep : eor.getEscherProperties()) {
                if ("groupshape.shapename".equals(ep.getName()) && ep.isComplex()) {
                    return new String(((EscherComplexProperty)ep).getComplexData(),
                            Charset.forName("UTF-16LE"));
                }
            }
            return null;
        }

        protected void addSheetPics(HSSFSheet sheet) {
            if (sheet == null) return;
            int picIdx=0;
            int emfIdx = 0;
            HSSFPatriarch patriarch = sheet.getDrawingPatriarch();
            if (patriarch == null) return;
            // Loop through the objects
            for (HSSFShape shape : patriarch.getChildren()) {
                if (!(shape instanceof HSSFPicture)) {
                    continue;
                }
                HSSFPicture picture = (HSSFPicture) shape;
                if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) continue;
                HSSFPictureData pd = picture.getPictureData();
                byte pictureBytes[] = pd.getData();
                int pictureBytesOffset = 0;
                int pictureBytesLen = pictureBytes.length;
                String filename = picture.getFileName();
                // try to find an alternative name
                if (filename == null || "".equals(filename)) {
                    filename = getAlternativeName(picture);
                }
                // default to dummy name
                if (filename == null || "".equals(filename)) {
                    filename = "picture"+(picIdx++);
                }
                filename = filename.trim();


                // check for emf+ embedded pdf (poor mans style :( )
                // Mac Excel 2011 embeds pdf files with this method.
                boolean validFile = true;
                if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) {
                    validFile = false;
                    int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes());
                    if (idxStart != -1) {
                        int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes());
                        if (idxEnd != -1) {
                            pictureBytesOffset = idxStart;
                            pictureBytesLen = idxEnd-idxStart+6;
                            validFile = true;
                        }
                    } else {
                        // This shape was not a Mac Excel 2011 embedded pdf file.
                        // So this is a shape related to a regular embedded object
                        // Lets update the object filename with the shapes filename
                        // if the object filename is of format ARGF1234.pdf
                        EmbeddedData ed_obj = embeddings.get(emfIdx);
                        Pattern pattern = Pattern.compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$");
                        Matcher matcher = pattern.matcher(ed_obj.filename);
                        if(matcher.matches()) {
                            ed_obj.filename = filename;
                        }
                        emfIdx += 1;
                    }
                }

                EmbeddedData ed = new EmbeddedData();
                ed.filename = fileNameWithoutPath(filename);
                ed.is = new ByteArrayInputStream(pictureBytes, pictureBytesOffset, pictureBytesLen);
                if(fileNotInEmbeddings(ed.filename) && validFile) {
                    embeddings.add(ed);
                }
            }
        }

        private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) {
            try {
                Method m = HSSFShape.class.getDeclaredMethod("getOptRecord");
                m.setAccessible(true);
                return (EscherOptRecord)m.invoke(shape);
            } catch (Exception e) {
                // todo: log ... well actually "should not happen" ;)
                return null;
            }
        }

        private String fileNameWithoutPath(String filename) {
            int last_index = filename.lastIndexOf("\\");
            return filename.substring(last_index + 1);
        }

        private boolean fileNotInEmbeddings(String filename) {
            boolean exists = true;
            for(EmbeddedData ed : embeddings) {
                if(ed.filename.equals(filename)) {
                    exists = false;
                }
            }
            return exists;
        }

        public void close() throws IOException {
            Iterator<EmbeddedData> ed = embeddings.iterator();
            while (ed.hasNext()) {
                ed.next().is.close();
            }
            wb.close();
        }
    }

    static class EmbeddedData {
        String filename;
        InputStream is;
        String source;
    }

    static abstract class EmbeddedExtractor {
        abstract boolean canExtract(DirectoryNode dn);
        abstract EmbeddedData extract(DirectoryNode dn) throws IOException;
        protected EmbeddedData extractFS(DirectoryNode dn, String filename) throws IOException {
            assert(canExtract(dn));
            POIFSFileSystem dest = new POIFSFileSystem();
            copyNodes(dn, dest.getRoot());
            EmbeddedData ed = new EmbeddedData();
            ed.filename = filename;
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            dest.writeFilesystem(bos);
            dest.close();
            ed.is = new ByteArrayInputStream(bos.toByteArray());
            return ed;
        }
    }

    static class Ole10Extractor extends EmbeddedExtractor {
        public boolean canExtract(DirectoryNode dn) {
            ClassID clsId = dn.getStorageClsid();
            return ClassID.OLE10_PACKAGE.equals(clsId);
        }
        public EmbeddedData extract(DirectoryNode dn) throws IOException {
            try {
                Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
                EmbeddedData ed = new EmbeddedData();
                ed.filename = new File(ole10.getFileName()).getName();
                ed.is = new ByteArrayInputStream(ole10.getDataBuffer());
                return ed;
            } catch (Ole10NativeException e) {
                throw new IOException(e);
            }
        }
    }

    static class PdfExtractor extends EmbeddedExtractor {
        static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
        public boolean canExtract(DirectoryNode dn) {
            ClassID clsId = dn.getStorageClsid();
            return (PdfClassID.equals(clsId)
            || dn.hasEntry("CONTENTS"));
        }
        public EmbeddedData extract(DirectoryNode dn) throws IOException {
            EmbeddedData ed = new EmbeddedData();
            ed.is = dn.createDocumentInputStream("CONTENTS");
            ed.filename = dn.getName()+".pdf";
            return ed;
        }
    }

    static class WordExtractor extends EmbeddedExtractor {
        public boolean canExtract(DirectoryNode dn) {
            ClassID clsId = dn.getStorageClsid();
            return (ClassID.WORD95.equals(clsId)
            || ClassID.WORD97.equals(clsId)
            || dn.hasEntry("WordDocument"));
        }
        public EmbeddedData extract(DirectoryNode dn) throws IOException {
            return extractFS(dn, dn.getName()+".doc");
        }
    }

    static class ExcelExtractor extends EmbeddedExtractor {
        public boolean canExtract(DirectoryNode dn) {
            ClassID clsId = dn.getStorageClsid();
            return (ClassID.EXCEL95.equals(clsId)
                    || ClassID.EXCEL97.equals(clsId)
                    || dn.hasEntry("Workbook") /*...*/);
        }
        public EmbeddedData extract(DirectoryNode dn) throws IOException {
            return extractFS(dn, dn.getName()+".xls");
        }
    }

    static class FsExtractor extends EmbeddedExtractor {
        public boolean canExtract(DirectoryNode dn) {
            return true;
        }
        public EmbeddedData extract(DirectoryNode dn) throws IOException {
            return extractFS(dn, dn.getName()+".dat");
        }
    }

    private static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
        for (Entry e : src) {
            if (e instanceof DirectoryNode) {
                DirectoryNode srcDir = (DirectoryNode)e;
                DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
                destDir.setStorageClsid(srcDir.getStorageClsid());
                copyNodes(srcDir, destDir);
            } else {
                InputStream is = src.createDocumentInputStream(e);
                dest.createDocument(e.getName(), is);
                is.close();
            }
        }
    }


    /**
     * Knuth-Morris-Pratt Algorithm for Pattern Matching
     * Finds the first occurrence of the pattern in the text.
     */
    private static int indexOf(byte[] data, int offset, byte[] pattern) {
        int[] failure = computeFailure(pattern);

        int j = 0;
        if (data.length == 0) return -1;

        for (int i = offset; i < data.length; i++) {
            while (j > 0 && pattern[j] != data[i]) {
                j = failure[j - 1];
            }
            if (pattern[j] == data[i]) { j++; }
            if (j == pattern.length) {
                return i - pattern.length + 1;
            }
        }
        return -1;
    }

    /**
     * Computes the failure function using a boot-strapping process,
     * where the pattern is matched against itself.
     */
    private static int[] computeFailure(byte[] pattern) {
        int[] failure = new int[pattern.length];

        int j = 0;
        for (int i = 1; i < pattern.length; i++) {
            while (j > 0 && pattern[j] != pattern[i]) {
                j = failure[j - 1];
            }
            if (pattern[j] == pattern[i]) {
                j++;
            }
            failure[i] = j;
        }

        return failure;
    }
}

Upvotes: 2

Related Questions