shravan
shravan

Reputation: 25

Write data into an Excel file after extracting the paragraphs/strings from an HWPFDocument (.doc files)

My code below extracts the paragraphs from .doc files and also searches them for specific strings. At the moment I can only get the output manually, on the console, through an Eclipse run configuration. What I want is: 1) the output written directly into an Excel file in the location where the .doc file is; 2) the output placed in specified cells only.

/**
 * Prints every paragraph of the document that starts with "MODE", followed by
 * every paragraph that starts with "TYPE", to standard output.
 *
 * @param docx the parsed .doc document whose paragraphs are scanned
 * @throws Exception kept from the original signature; propagates whatever the
 *                   POI extraction calls throw
 */
public static void readParagraphs(HWPFDocument docx) throws Exception {
    we = new WordExtractor(docx);
    String[] paragraphs = we.getParagraphText();

    // MODE paragraphs, each followed by a line break.
    for (String p : paragraphs) {
        if (p.startsWith("MODE")) {
            System.out.println("       " + p);
        }
    }

    // TYPE paragraphs. print() is used instead of format(): the paragraph is
    // data, not a format string, so a '%' in the document text must not be
    // interpreted as a conversion (format() would throw or garble the output).
    for (String type : paragraphs) {
        if (type.startsWith("TYPE")) {
            System.out.print("       " + type);
        }
    }
}

Expected output:

S.no  | Doc name  | Title    | mode            | type
=====================================================================
1     | laptop    | A12345   | abcd 123456     | efghij A12345/123456
2     | laptop    | A12346   | abcd 123457     | efghij A12345/123457
3     | laptop    | A12347   | abcd 123458     | efghij A12345/123458

Here is the piece of my code that uses HSSFWorkbook:

// Create an .xls workbook with a single sheet named "firstsheet".
HSSFWorkbook workbook = new HSSFWorkbook();
HSSFSheet sheet = workbook.createSheet("firstsheet");

// Row 0 holds the column titles, written left to right starting at column 0.
String[] headerTitles = {"S.NO", "DOC NAME", "TITLE", "MODE", "TYPE"};
Row row1 = sheet.createRow(0);
for (int col = 0; col < headerTitles.length; col++) {
    row1.createCell(col).setCellValue(headerTitles[col]);
}

// Two data rows: each takes the next row index from rowNum and fills only
// the MODE (column 3) and TYPE (column 4) cells, prefixed with one space.
Row row2 = sheet.createRow(rowNum++);
row2.createCell(3).setCellValue(" " + mode);
row2.createCell(4).setCellValue(" " + type);

Row row3 = sheet.createRow(rowNum++);
row3.createCell(3).setCellValue(" " + mode);
row3.createCell(4).setCellValue(" " + type);

The table below appears on sheet 1, in the header. I need to extract only ' A12345' from it.

 =====================================
 |  xx |       A12345         |xx    |     
 =====================================

The table below appears either on sheet 2 or on one of sheets 3-6, depending on the document.

 --------------------------------------------.--------------------
|MODE :  Abcde 123456 efghit 234567  sddsldjf 232132             |
|----------------------------------------------------------------|  
|INFO   |TYPE : efghij A12345/123456 dsflsdjflsd B22323/&123456  |
|       |xxxxxxxxxxxxxxxxxalphanumericxxxxxxxxxxxxxxxxxxxxxxxxxx |
 -----------------------------------------------------------------

if(p.startsWith("MODE")) // this check prints the line " MODE : Abcde 123456 efghit 234567 sddsldjf 232132 ". if(type.startsWith("TYPE")) // this check prints the line "TYPE : efghij A12345/123456 dsflsdjflsd B22323/&123456 ". However, in some documents there is no 'TYPE' line, so I see two options: either read the lines that follow 'MODE' until 'JUSTIFICATION' is reached, or use pattern recognition to fetch the 'TYPE' lines. I am looking for suggestions.

The table below appears after the table above.

 -----------------------------------------------------------
|JUSTIFICATION                                              |   
|-----------------------------------------------------------
|   |   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx    |
|   |   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx alphanumericxxx    |
 -----------------------------------------------------------


import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.sl.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DataFormat;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import java.util.ArrayList;
import java.util.List;
import java.io.*;
/**
 * Reads a fixed list of Word (.doc) files and, for each one, writes an Excel
 * (.xls) sheet whose MODE column holds the document's "MODE" paragraph.
 *
 * NOTE(review): {@link #readParagraphs(HWPFDocument)} always writes to
 * {@link #OUTPUT_PATH}, and {@link #main(String[])} calls it once per input
 * file, so each document replaces the file written for the previous one.
 * Only the DOC-independent columns S.NO and MODE are filled in.
 */
public class ReadDocFileFromJava {

    public static int test = 0;

    /** Extractor for the document most recently passed to readParagraphs. */
    private static WordExtractor ex;

    /** Destination of the generated spreadsheet. */
    private static final String OUTPUT_PATH = "C:\\output.xls";

    /** Number of data rows written below the header row. */
    private static final int DATA_ROWS = 10;

    /**
     * Processes each hard-coded .doc path in turn.
     *
     * @param args unused
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws IOException {

        List<String> fileName = new ArrayList<String>();

        fileName.add("C:\\1200.doc");
        fileName.add("C:\\1210.doc");
        fileName.add("C:\\1211.doc");
        fileName.add("C:\\1212.doc");
        fileName.add("C:\\1213.doc");
        // document 2
        fileName.add("C:\\1214.doc");
        fileName.add("C:\\1215.doc");
        fileName.add("C:\\1216.doc");
        fileName.add("C:\\1217.doc");

        for (String path : fileName) {
            readMyDocument(path);
        }
    }

    /**
     * Opens the .doc file at the given path and hands it to
     * {@link #readParagraphs(HWPFDocument)}. Failures are reported on stderr
     * and do not stop the caller, so the remaining files are still processed.
     *
     * @param i path of the .doc file to read
     */
    public static void readMyDocument(String i) {
        try {
            FileInputStream in = new FileInputStream(i);
            try {
                POIFSFileSystem fs = new POIFSFileSystem(in);
                try {
                    readParagraphs(new HWPFDocument(fs));
                } finally {
                    // Close even when extraction or writing fails.
                    fs.close();
                }
            } finally {
                in.close();
            }
        } catch (Exception e) {
            // Best effort: report and let the caller move on to the next file.
            e.printStackTrace();
        }
    }

    /**
     * Writes {@link #OUTPUT_PATH}: a header row followed by {@link #DATA_ROWS}
     * rows, each carrying a serial number (starting at 1) in column 0 and the
     * document's MODE paragraph in column 3.
     *
     * @param docx the parsed .doc document to extract paragraphs from
     * @throws Exception if extraction or writing the workbook fails
     */
    public static void readParagraphs(HWPFDocument docx) throws Exception {
        ex = new WordExtractor(docx);
        String[] paragraphs = ex.getParagraphText();

        // When several paragraphs start with "MODE" the last one wins, as it
        // did when every match overwrote the same cell.
        String mode = null;
        for (String p : paragraphs) {
            if (p.startsWith("MODE")) {
                mode = p;
            }
        }

        HSSFWorkbook workbook = new HSSFWorkbook();
        try {
            HSSFSheet sheet = workbook.createSheet("firstsheet");

            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("S.NO");
            header.createCell(1).setCellValue("DOC NAME");
            header.createCell(2).setCellValue("TITLE");
            header.createCell(3).setCellValue("MODE");
            header.createCell(4).setCellValue("TYPE");

            for (int rowIndex = 1; rowIndex <= DATA_ROWS; rowIndex++) {
                Row row = sheet.createRow(rowIndex);
                // Serial numbers start at 1, matching the expected table.
                row.createCell(0).setCellValue(rowIndex);
                // The MODE cell is created even when no paragraph matched.
                Cell modeCell = row.createCell(3);
                if (mode != null) {
                    modeCell.setCellValue(mode);
                }
            }

            // Write once, after all rows exist, and always release the stream.
            FileOutputStream out = new FileOutputStream(OUTPUT_PATH);
            try {
                workbook.write(out);
            } finally {
                out.close();
            }
        } finally {
            workbook.close();
        }
    }
}

S.NO  | DOC NAME | TITLE | MODE | TYPE
==========================================
1                          XXXX
2                          XXXX 
3                          XXXX
4                          XXXX
5                          XXXX
6                          XXXX
7                          XXXX
8                          XXXX
9                          XXXX

Upvotes: 0

Views: 431

Answers (1)

XtremeBaumer
XtremeBaumer

Reputation: 6435

I modified your code a bit, but it is surely not working yet, as I am missing information. Maybe you can try to code the rest.

/**
 * Reads a fixed list of Word (.doc) files, collects the paragraphs of each,
 * and writes one Excel (.xls) sheet with one row per document.
 *
 * Only the S.NO and MODE columns are filled in; DOC NAME, TITLE and TYPE get
 * a header cell but no data.
 */
public class ReadDocFileFromJava {

    public static int test = 0;

    /** Extractor for the document most recently read by readMyDocument. */
    private static WordExtractor ex;

    /** Paragraphs of every document read so far, in input order. */
    private static List<String[]> allParagraphs = new ArrayList<String[]>();

    /** Destination of the generated spreadsheet. */
    private static final String OUTPUT_PATH = "C:\\output.xls";

    /**
     * Reads each hard-coded .doc path, then writes the combined spreadsheet.
     *
     * @param args unused
     * @throws IOException if the spreadsheet cannot be written
     */
    public static void main(String[] args) throws IOException {

        List<String> fileName = new ArrayList<String>();

        fileName.add("C:\\1200.doc");
        fileName.add("C:\\1210.doc");
        fileName.add("C:\\1211.doc");
        fileName.add("C:\\1212.doc");
        fileName.add("C:\\1213.doc");
        // document 2
        fileName.add("C:\\1214.doc");
        fileName.add("C:\\1215.doc");
        fileName.add("C:\\1216.doc");
        fileName.add("C:\\1217.doc");

        for (String path : fileName) {
            allParagraphs.add(readMyDocument(path));
        }

        // readParagraphs declares Exception; narrow it to what main declares
        // while keeping the original cause.
        try {
            readParagraphs(allParagraphs);
        } catch (IOException e) {
            throw e;
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            throw new IOException("Could not write " + OUTPUT_PATH, e);
        }
    }

    /**
     * Extracts the paragraph texts of the .doc file at the given path.
     * Failures are reported on stderr and yield an empty array, so one
     * unreadable file does not stop the others from being processed.
     *
     * @param i path of the .doc file to read
     * @return the document's paragraphs, or an empty array if it could not be read
     */
    public static String[] readMyDocument(String i) {
        // Assigned up front so a failure still returns a usable value.
        String[] paragraph = new String[0];
        try {
            FileInputStream in = new FileInputStream(i);
            try {
                POIFSFileSystem fs = new POIFSFileSystem(in);
                try {
                    HWPFDocument docx = new HWPFDocument(fs);
                    ex = new WordExtractor(docx);
                    paragraph = ex.getParagraphText();
                } finally {
                    // Close even when extraction fails.
                    fs.close();
                }
            } finally {
                in.close();
            }
        } catch (Exception e) {
            // Best effort: report and return the empty array.
            e.printStackTrace();
        }
        return paragraph;
    }

    /**
     * Writes {@link #OUTPUT_PATH}: a header row followed by one row per
     * document, carrying a serial number (starting at 1) in column 0 and that
     * document's MODE paragraph in column 3.
     *
     * @param paragraphs one entry per document, each holding that document's
     *                   paragraph texts
     * @throws Exception if writing the workbook fails
     */
    public static void readParagraphs(List<String[]> paragraphs) throws Exception {
        HSSFWorkbook workbook = new HSSFWorkbook();
        try {
            HSSFSheet sheet = workbook.createSheet("firstsheet");

            Row row = sheet.createRow(0);
            row.createCell(0).setCellValue("S.NO");
            row.createCell(1).setCellValue("DOC NAME");
            row.createCell(2).setCellValue("TITLE");
            row.createCell(3).setCellValue("MODE");
            row.createCell(4).setCellValue("TYPE");

            int rowIndex = 1;
            for (String[] document : paragraphs) {
                row = sheet.createRow(rowIndex);
                // Serial numbers start at 1, matching the expected table.
                row.createCell(0).setCellValue(rowIndex);

                String mode = lastStartingWith(document, "MODE");
                if (mode != null) {
                    row.createCell(3).setCellValue(mode);
                }
                rowIndex++;
            }

            // Write once, after all rows exist, and always release the stream.
            FileOutputStream out = new FileOutputStream(OUTPUT_PATH);
            try {
                workbook.write(out);
            } finally {
                out.close();
            }
        } finally {
            workbook.close();
        }
    }

    /** Returns the last paragraph starting with the prefix, or null if none does. */
    private static String lastStartingWith(String[] document, String prefix) {
        String match = null;
        if (document != null) {
            for (String p : document) {
                if (p != null && p.startsWith(prefix)) {
                    match = p;
                }
            }
        }
        return match;
    }
}

Upvotes: 0

Related Questions