Reputation:
i have checked everywhere online and stackoverflow and could not find a match specific to this issue. I am trying to extract a pdf file that is located in a zip file that is inside a zip file (nested zips). Re-calling the method i am using to extract does not work nor does changing the whole program to accept Inputstreams instead of how i am doing it below. The .pdf file inside the nested zip is just skipped at this stage
public static void main(String[] args)
{
try
{
//Paths
String basePath = "C:\\Users\\user\\Desktop\\Scan\\";
File lookupDir = new File(basePath + "Data\\");
String doneFolder = basePath + "DoneUnzipping\\";
File[] directoryListing = lookupDir.listFiles();
for (int i = 0; i < directoryListing.length; i++)
{
if (directoryListing[i].isFile()) //there's definately a file
{
//Save the current file's path
String pathOrigFile = directoryListing[i].getAbsolutePath();
Path origFileDone = Paths.get(pathOrigFile);
Path newFileDone = Paths.get(doneFolder + directoryListing[i].getName());
//unzip it
if(directoryListing[i].getName().toUpperCase().endsWith(ZIP_EXTENSION)) //ZIP files
{
unzip(directoryListing[i].getAbsolutePath(), DESTINATION_DIRECTORY + directoryListing[i].getName());
//move to the 'DoneUnzipping' folder
Files.move(origFileDone, newFileDone);
}
}
}
} catch (Exception e)
{
e.printStackTrace(System.out);
}
}
private static void unzip(String zipFilePath, String destDir)
{
//buffer for read and write data to file
byte[] buffer = new byte[BUFFER_SIZE];
try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zipFilePath)))
{
FileInputStream fis = new FileInputStream(zipFilePath);
ZipEntry ze = zis.getNextEntry();
while(ze != null)
{
String fileName = ze.getName();
int index = fileName.lastIndexOf("/");
String newFileName = fileName.substring(index + 1);
File newFile = new File(destDir + File.separator + newFileName);
//Zips inside zips
if(fileName.toUpperCase().endsWith(ZIP_EXTENSION))
{
ZipInputStream innerZip = new ZipInputStream(zis);
ZipEntry innerEntry = null;
while((innerEntry = innerZip.getNextEntry()) != null)
{
System.out.println("The file: " + fileName);
if(fileName.toUpperCase().endsWith("PDF"))
{
FileOutputStream fos = new FileOutputStream(newFile);
int len;
while ((len = innerZip.read(buffer)) > 0)
{
fos.write(buffer, 0, len);
}
fos.close();
}
}
}
//close this ZipEntry
zis.closeEntry(); // java.io.IOException: Stream Closed
ze = zis.getNextEntry();
}
//close last ZipEntry
zis.close();
fis.close();
} catch (IOException e)
{
e.printStackTrace();
}
}
Upvotes: 1
Views: 667
Reputation:
So I found no way to do as my question states, and no one answered with the below (so sorry for answering my own question posted). The problem comes in when reading the path of the zip inside the zip. What needs to happen, and what is illustrated below, is the nested zip needs to be moved to a temp folder and extracted there and removed afterwards. So the unzip function is called as many times as there are nested zips.
I struggled a while with this one, hope it helps someone somewhere along the line..
import java.util.logging.Logger;
import java.util.logging.FileHandler;
import java.util.logging.SimpleFormatter;
import java.util.zip.ZipInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.io.File;
import java.io.FileOutputStream;
import java.util.zip.ZipEntry;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.PrintWriter;
public class Unzipper
{
private static String baseDir = "";
private static String PDF_DESTINATION_DIRECTORY = "";
private static String extractionLogFile = "";
private static final int BUFFER_SIZE = 4096;
private static int count = 0;
private static int loggedCount = 0;
private static String ZIP_EXTENSION = "ZIP";
public static void main(String[] args)
{
baseDir = File.separator + "Users" + File.separator + "user" + File.separator + "Desktop" + File.separator + "ZipFolder" + File.separator;
PDF_DESTINATION_DIRECTORY = File.separator +"Users" + File.separator + "user" + File.separator +"Desktop" + File.separator + "ZipFolder" + File.separator + "PDFs" + File.separator;
extractionLogFile = File.separator + "Users" + File.separator + "user" + File.separator +"Desktop" + File.separator + "ZipFolder" + File.separator + "ExtractionLogFile.txt";
try
{
//Paths
File lookupDir = new File(baseDir);
String doneFolder = PDF_DESTINATION_DIRECTORY;
//Logger properties
Logger logger = Logger.getLogger("ExtractionLog");
FileHandler fh;
fh = new FileHandler(extractionLogFile);
logger.addHandler(fh);
logger.setUseParentHandlers(false);
SimpleFormatter formatter = new SimpleFormatter();
fh.setFormatter(formatter);
//make some folders if they are not there
makeDirIfNotExist(baseDir + "PDFs");
makeDirIfNotExist(baseDir + File.separator + "_Temp");
File[] directoryListing = lookupDir.listFiles();
for (int i = 0; i < directoryListing.length; i++)
{
if (directoryListing[i].isFile()) //there's definately a file
{
//Save the current file's path
String pathOrigFile = directoryListing[i].getAbsolutePath();
Path origFileDone = Paths.get(pathOrigFile);
Path newFileDone = Paths.get(doneFolder + directoryListing[i].getName());
//make sure directory exist
String dirPDFdestName = directoryListing[i].getName();
makeDirIfNotExist(PDF_DESTINATION_DIRECTORY.concat(dirPDFdestName.substring(0, dirPDFdestName.length() - 4)));
//unzip it
if (directoryListing[i].getName().toUpperCase().endsWith(ZIP_EXTENSION)) //ZIP files
{
checkTheZip(directoryListing[i].getAbsolutePath(), PDF_DESTINATION_DIRECTORY.concat(dirPDFdestName.substring(0, dirPDFdestName.length() - 4)));
//move to the 'PDFs' folder
moveFile(origFileDone, newFileDone);
}
}
}
logger.info("Cycle completed, Processed files: " + loggedCount); // (just checking)
loggedCount = 0;
} catch (Exception e)
{
appendToFile(e);
}
}
//dig into the zip file
private static void checkTheZip(String zipFilePath, String destDirName)
{
unzip(zipFilePath, destDirName);
loggedCount++;
}
//move the file
private static void moveFile(Path fromDest, Path toDest)
{
File lookupDir = new File(toDest.toString());
try
{
if(!lookupDir.exists())
{
Files.move(fromDest, toDest); //, OPTIONAL: StandardCopyOption.REPLACE_EXISTING
}
} catch (Exception e)
{
appendToFile(e);
}
}
private static File makeDirIfNotExist(String directory)
{
File dir = new File(directory);
if (!dir.exists())
{
dir.mkdir();
}
return new File(directory + File.separator);
}
public static void appendToFile(Exception e)
{
try
{
FileWriter fstream = new FileWriter(extractionLogFile, true);
BufferedWriter out = new BufferedWriter(fstream);
PrintWriter pWriter = new PrintWriter(out, true);
e.printStackTrace(pWriter);
}
catch (Exception ie)
{
throw new RuntimeException("Could not write Exception to file", ie);
}
}
private static void unzip(String zipFilePath, String destDirName)
{
//buffer for read and write data to file
byte[] buffer = new byte[BUFFER_SIZE];
try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zipFilePath)))
{
FileInputStream fis = new FileInputStream(zipFilePath);
ZipEntry ze = zis.getNextEntry();
while (ze != null)
{
String fileName = ze.getName();
int index = fileName.lastIndexOf(File.separator);
String newFileName = fileName.substring(index + 1);
File newFile = new File(destDirName + File.separator + newFileName);
//PDFs
if (fileName.toUpperCase().endsWith("PDF"))
{
FileOutputStream fos = new FileOutputStream(newFile);
int len;
while ((len = zis.read(buffer)) > 0)
{
fos.write(buffer, 0, len);
}
fos.close();
}
//Zips inside zips
if (fileName.toUpperCase().endsWith(ZIP_EXTENSION))
{
Path newFileDone = Paths.get(baseDir + "_Temp");
Path origFileDone = Paths.get(destDirName + File.separator);
newFile = new File(baseDir + "_Temp" + File.separator + newFileName);
FileOutputStream fos = new FileOutputStream(newFile);
int len;
while ((len = zis.read(buffer)) > 0)
{
fos.write(buffer, 0, len);
}
fos.close();
//move the zip out of the folder to a temp folder then
moveFile(origFileDone, newFileDone);
//search the temp folder for entries and handle the .zip file from there
checkTheZip(baseDir + "_Temp" + File.separator + fileName, destDirName);
//remove the Temp_ folders contents
Files.walk(Paths.get(baseDir + "_Temp" + File.separator))
.filter(Files::isRegularFile)
.map(Path::toFile)
.forEach(File::delete);
}
//close this ZipEntry
zis.closeEntry();
ze = zis.getNextEntry();
}
//close last ZipEntry
zis.close();
fis.close();
} catch (IOException e)
{
appendToFile(e);
}
}
}
Upvotes: 0
Reputation: 15196
The line that causes your problem looks to be auto-close block you have created when reading the inner zip:
try(ZipInputStream innerZip = new ZipInputStream(fis)) {
...
}
Several likely issues: firstly it is reading the wrong stream - fis
not the existing zis
.
Secondly, you shouldn't use try-with-resources for auto-close on innerZip
as this implicitly calls innerZip.close()
when exiting the block. If you view the source code of ZipInputStream
via a good IDE you should see (eventually) that ZipInputStream extends InflaterInputStream
which itself extends FilterInputStream
. A call to innerZip.close()
will close the underlying outer stream zis
(fis
in your case) hence stream is closed when you resume the next entry of the outer zip.
Therefore remove the try()
block and add use of zis
:
ZipInputStream innerZip = new ZipInputStream(zis);
Use try-catch block only for the outermost file handling:
try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zipFilePath))) {
ZipEntry ze = zis.getNextEntry();
...
}
Thirdly, you appear to be copying the wrong stream when extracting a PDF - use innerZip
not outer zis
. The code will never extract PDF as these 2 lines can never be true at the same time because a file ending ZIP will never end PDF too:
if(fileName.toUpperCase().endsWith(ZIP_EXTENSION)) {
...
// You want innerEntry.getName() here
if(fileName.toUpperCase().endsWith("PDF"))
You should be able to switch to one line Files.copy
and make use of the PDF filename not zip filename:
if(innerEntry.getName().toUpperCase().endsWith("PDF")) {
Path newFile = Paths.get(destDir + '-'+innerEntry.getName().replace("/", "-"));
System.out.println("Files.copy to " + newFile);
Files.copy(innerZip, newFile);
}
Upvotes: 0
Reputation: 11902
Your question asks how to use java (by implication in windows) to extract a pdf from a zip inside another outer zip.
In many systems including windows it is a single line command that will depend on the location of source and target folders, however using the shortest example of current downloads folder it would be in a shell as simple as
tar -xf "german (2).zip" && tar -xf "german.zip" && german.pdf
to shell the command in windows see How do I execute Windows commands in Java?
The default pdf viewer can open the result so Windows Edge or in my case SumatraPDF
There is generally no point in putting a pdf inside a zip because it cannot be run in there. So single nesting would be advisable if needed for download transportation.
There is no need to add a password to the zip because PDF uses its own password for opening. Thus unwise to add two levels of complexity. Keep it simple.
If you have multiple zips nested inside multiple zips with multiple pdfs in each then you have to be more specific by filtering names. However avoid that extra onion skin where possible.
\Downloads>tar -xf "german (2).zip" "both.zip" && tar -xf "both.zip" "English language.pdf"
You could complicate that by run in a memory or temp folder but it is reliable and simple to use the native file system so consider without Java its fastest to run
CD /D "C:/Users/user/Desktop/Scan/DoneUnzipping" && for %f in (..\Data\*.zip) do tar -xf "%f" "*.zip" && for %f in (*.zip) do tar -xf "%f" "*.pdf" && del "*.zip"
This will extract all inner zips into working folder then extract all PDFs and remove all the essential temporary zips. The source double zips will not be deleted simply touched.
Upvotes: 0
Reputation: 2742
The solution to this is not as obvious as it seems. Despite writing a few zip utilities myself some time ago, getting zip entries from inside another zip file only seems obvious in retrospect
(and I also got the java.io.IOException: Stream Closed
on my first attempt).
The Java classes for ZipFile
and ZipInputStream
really direct your thinking into using the file system, but it is not required.
The functions below will scan a parent-level zip file, and continue scanning until it finds an entry with a specified name. (Nearly) everything is done in-memory.
Naturally, this can be modified to use different search criteria, find multiple file types, etc. and take different actions, but this at least demonstrates the basic technique in question -- zip files inside of zip files -- no guarantees on other aspects of the code, and someone more savvy could most likely improve the style.
final static String ZIP_EXTENSION = ".zip";
public static byte[] getOnePDF() throws IOException
{
final File source = new File("/path/to/MegaData.zip");
final String nameToFind = "FindThisFile.pdf";
final ByteArrayOutputStream mem = new ByteArrayOutputStream();
try (final ZipInputStream in = new ZipInputStream(new BufferedInputStream(new FileInputStream(source))))
{
digIntoContents(in, nameToFind, mem);
}
// Save to disk, if you want
// copy(new ByteArrayInputStream(mem.toByteArray()), new FileOutputStream(new File("/path/to/output.pdf")));
// Otherwise, just return the binary data
return mem.toByteArray();
}
private static void digIntoContents(final ZipInputStream in, final String nameToFind, final ByteArrayOutputStream mem) throws IOException
{
ZipEntry entry;
while (null != (entry = in.getNextEntry()))
{
final String name = entry.getName();
// Found the file we are looking for
if (name.equals(nameToFind))
{
copy(in, mem);
return;
}
// Found another zip file
if (name.toUpperCase().endsWith(ZIP_EXTENSION.toUpperCase()))
{
digIntoContents(new ZipInputStream(new ByteArrayInputStream(getZipEntryFromMemory(in))), nameToFind, mem);
}
}
}
private static byte[] getZipEntryFromMemory(final ZipInputStream in) throws IOException
{
final ByteArrayOutputStream mem = new ByteArrayOutputStream();
copy(in, mem);
return mem.toByteArray();
}
// General purpose, reusable, utility function
// OK for binary data (bad for non-ASCII text, use Reader/Writer instead)
public static void copy(final InputStream from, final OutputStream to) throws IOException
{
final int bufferSize = 4096;
final byte[] buf = new byte[bufferSize];
int len;
while (0 < (len = from.read(buf)))
{
to.write(buf, 0, len);
}
to.flush();
}
Upvotes: 0