Nigel Thomas
Nigel Thomas

Reputation: 1929

How to avoid skipping blank rows or columns in Apache POI

When I parse the file using Apace POI, the empty rows are getting skipped and a List of String arrays with non-empty rows are being returned.

How can I tell the API, not to skip reading the empty row or columns ?

The code to read is somehow skipping the rows with no data.

XExcelFileReader fileReader = new XExcelFileReader(excelByteArray, sheetFromWhichToRead);

                    dataFromFile = fileReader.readRows();

is used to read the data from the class XExcelFileReader. The variable dataFromFile is a List of String array.

This is the code to read the data row:

    import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;

/**
 *  This class is responsible for reading the row contents for a given byte array and sheet name
 */
public class XExcelFileReader {
    /**
     * This is the Row Num of the current row that was read
     */
    private int rowNum = 0;
    /**
     * The OPCPackage is the package used to laod the Input Stream used for only .xlsx Files
     */
    private OPCPackage opcPkg;
    /**
     * These are the String Tables that are read from the Excel File
     */
    private ReadOnlySharedStringsTable stringsTable;
    /**
     * The XML Streaming API will be used
     */
    private XMLStreamReader xmlReader;

    /** The styles table which has formatting information about cells. */
    private StylesTable styles;

    /**
     * @param excelByteArray the excel byte array
     * @param sheetFromWhichToRead the excel sheet from which to read
     * @throws Exception the exception
     */
    public XExcelFileReader(final byte[] excelByteArray, final String sheetFromWhichToRead) throws Exception {

        InputStream excelStream = new ByteArrayInputStream(excelByteArray);
        opcPkg = OPCPackage.open(excelStream);
        this.stringsTable = new ReadOnlySharedStringsTable(opcPkg);

        XSSFReader xssfReader = new XSSFReader(opcPkg);
        styles = xssfReader.getStylesTable();
        XMLInputFactory factory = XMLInputFactory.newInstance();

        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
        InputStream inputStream = null;
        while (iter.hasNext()) {
            inputStream = iter.next();
            String tempSheetName = iter.getSheetName();
            if (StringUtils.isNotEmpty(tempSheetName)) {
                tempSheetName = tempSheetName.trim();
                if (tempSheetName.equals(sheetFromWhichToRead)) {
                    break;
                }
            }
        }
        xmlReader = factory.createXMLStreamReader(inputStream);

        while (xmlReader.hasNext()) {
            xmlReader.next();
            if (xmlReader.isStartElement()) {
                if (xmlReader.getLocalName().equals("sheetData")) {
                    break;
                }
            }
        }
    }

    /**
     * @return rowNum
     */
    public final int rowNum() {
        return rowNum;
    }

    /**
     * @return List<String[]> List of String array which can hold the content
     * @throws XMLStreamException the XMLStreamException
     */
    public final List<String[]> readRows() throws XMLStreamException {
        String elementName = "row";
        List<String[]> dataRows = new ArrayList<String[]>();
        while (xmlReader.hasNext()) {
            xmlReader.next();
            if (xmlReader.isStartElement()) {
                if (xmlReader.getLocalName().equals(elementName)) {
                    rowNum++;
                    dataRows.add(getDataRow());
                    // TODO need to see if the batch Size is required
                    // if (dataRows.size() == batchSize)
                    // break;
                }
            }
        }

        return dataRows;
    }

    /**
     * @return String [] of Row Data
     * @throws XMLStreamException the XMLStreamException
     */
    private String[] getDataRow() throws XMLStreamException {
        List<String> rowValues = new ArrayList<String>();
        while (xmlReader.hasNext()) {
            xmlReader.next();
            if (xmlReader.isStartElement()) {
                if (xmlReader.getLocalName().equals("c")) {
                    CellReference cellReference = new CellReference(xmlReader.getAttributeValue(null, "r"));
                    // Fill in the possible blank cells!
                    while (rowValues.size() < cellReference.getCol()) {
                        rowValues.add("");
                    }
                    String cellType = xmlReader.getAttributeValue(null, "t");
                    String cellStyleStr = xmlReader.getAttributeValue(null, "s");
                    rowValues.add(getCellValue(cellType, cellStyleStr));
                }
            } else if (xmlReader.isEndElement() && xmlReader.getLocalName().equals("row")) {
                break;
            }
        }
        return rowValues.toArray(new String[rowValues.size()]);
    }

    /**
     * @param cellType the cell type
     * @param cellStyleStr the cell style
     * @return cell content the cell value
     * @throws XMLStreamException the XMLStreamException
     */
    private String getCellValue(final String cellType, final String cellStyleStr) throws XMLStreamException {
        String value = ""; // by default
        while (xmlReader.hasNext()) {
            xmlReader.next();
            if (xmlReader.isStartElement()) {
                if (xmlReader.getLocalName().equals("v")) {

                    if (cellType != null && cellType.equals("s")) {
                        int idx = Integer.parseInt(xmlReader.getElementText());
                        String s = stringsTable.getEntryAt(idx);
                        return new XSSFRichTextString(s).toString();
                    }

                    if (cellStyleStr != null) {
                        String cellValue = xmlReader.getElementText();
                        int styleIndex = Integer.parseInt(cellStyleStr);
                        XSSFCellStyle style = styles.getStyleAt(styleIndex);
                        short formatIndex = style.getDataFormat();
                        if (!isValidDouble(cellValue)) {
                            return cellValue;
                        }
                        double doubleVal = Double.valueOf(cellValue);
                        boolean isValidExcelDate = HSSFDateUtil.isInternalDateFormat(formatIndex);
                        Date date = null;
                        if (isValidExcelDate) {
                            date = HSSFDateUtil.getJavaDate(doubleVal);
                            String dateStr = dateAsString(date);
                            return dateStr;
                        }
                        if (!(doubleVal == Math.floor(doubleVal))) {
                            return Double.toString(doubleVal);
                        }
                        return cellValue;
                    }

                    else {
                        return xmlReader.getElementText();
                    }
                }
            } else if (xmlReader.isEndElement() && xmlReader.getLocalName().equals("c")) {
                break;
            }
        }
        return value;
    }

    /**
     * To check whether the incoming value can be used in the Double utility method Double.valueOf() to prevent
     * NumberFormatException.
     * @param stringVal - String to be validated.
     * @return - true if it is a valid String which can be passed into Double.valueOf method. <br/> For more information
     *         refer- <a>https://docs.oracle.com/javase/7/docs/api/java/lang/Double.html#valueOf(java.lang.String)</a>
     */
    private boolean isValidDouble(String stringVal) {
        final String Digits = "(\\p{Digit}+)";
        final String HexDigits = "(\\p{XDigit}+)";
        // an exponent is 'e' or 'E' followed by an optionally
        // signed decimal integer.
        final String Exp = "[eE][+-]?" + Digits;
        final String fpRegex = ("[\\x00-\\x20]*" + // Optional leading "whitespace"
                "[+-]?(" + // Optional sign character
                "NaN|" + // "NaN" string
                "Infinity|" + // "Infinity" string

                // A decimal floating-point string representing a finite positive
                // number without a leading sign has at most five basic pieces:
                // Digits . Digits ExponentPart FloatTypeSuffix
                //
                // Since this method allows integer-only strings as input
                // in addition to strings of floating-point literals, the
                // two sub-patterns below are simplifications of the grammar
                // productions from the Java Language Specification, 2nd
                // edition, section 3.10.2.

                // Digits ._opt Digits_opt ExponentPart_opt FloatTypeSuffix_opt
                "(((" + Digits + "(\\.)?(" + Digits + "?)(" + Exp + ")?)|" +

        // . Digits ExponentPart_opt FloatTypeSuffix_opt
                "(\\.(" + Digits + ")(" + Exp + ")?)|" +

                // Hexadecimal strings
                "((" +
                // 0[xX] HexDigits ._opt BinaryExponent FloatTypeSuffix_opt
                "(0[xX]" + HexDigits + "(\\.)?)|" +

                // 0[xX] HexDigits_opt . HexDigits BinaryExponent FloatTypeSuffix_opt
                "(0[xX]" + HexDigits + "?(\\.)" + HexDigits + ")" +

                ")[pP][+-]?" + Digits + "))" + "[fFdD]?))" + "[\\x00-\\x20]*");// Optional trailing "whitespace"

        if (Pattern.matches(fpRegex, stringVal))
            return true;
        else {
            return false;
        }

    }

    /**
     * Date as string.
     * @param date the date
     * @return the string
     */
    public static String dateAsString(final Date date) {
        String dateAsString = null;
        if (date != null) {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            dateAsString = sdf.format(date);
        }
        return dateAsString;
    }

    @Override
    protected final void finalize() throws Throwable {
        if (opcPkg != null) {
            opcPkg.close();
        }
        super.finalize();
    }

}

I want the Empty row also to be part of the List of String array. Just that the String array will be empty.

If it is in the Excel like:

First Row

<< No Data >>

Third Row

then the List should be

dataFromFile
        0 ->[First Row]
        1 ->[]
        2 ->[Third Row]

Upvotes: 2

Views: 11320

Answers (1)

jmarkmurphy
jmarkmurphy

Reputation: 11473

Here is what the Busy Developers Guide on the POI site says about reading sheets with missing rows:

for (int rowNum = rowStart; rowNum < rowEnd; rowNum++) {
   Row r = sheet.getRow(rowNum);
   if (r == null) {
      // This whole row is empty
      // Handle it as needed
      continue;
   }

   int lastColumn = Math.max(r.getLastCellNum(), MY_MINIMUM_COLUMN_COUNT);

   for (int cn = 0; cn < lastColumn; cn++) {
      Cell c = r.getCell(cn, Row.RETURN_BLANK_AS_NULL);
      if (c == null) {
         // The spreadsheet is empty in this cell
      } else {
         // Do something useful with the cell's contents
      }
   }
}

You really can get a lot of your answers there. You shouldn't ever have to parse the XML directly. Even things that are missing from the high level usermodel usually have a way to get at it through the CT classes generated by XMLBeans.

Upvotes: 7

Related Questions