Reputation: 3
I'm using Apache POI (XWPF) with Spring Boot in Java 11.
I need to extract section number 2 (title and content) from a Word document with the following numbered list:
I would like to know the best way to get only section 2 and its content, so that I can create a new Word document containing only that part.
/**
 * Prints the text of every paragraph in the document, one output line per paragraph.
 *
 * The text of a paragraph is assembled from the string form of its runs.
 */
private void extractAllParagraphs(){
    // Get document
    // NOTE(review): the document (and its underlying package) is never closed here.
    // The package is opened from a path, so confirm the access mode before adding
    // a close() call -- closing a writable package may write back to the file.
    XWPFDocument doc = new XWPFDocument(OPCPackage.open("path..."));
    // Loop over all paragraphs
    for (XWPFParagraph p : doc.getParagraphs()) {
        // Start with an empty buffer for each paragraph so that the text of the
        // previous paragraphs is not printed again.
        StringBuilder textPart = new StringBuilder();
        // Loop over the runs of the paragraph
        for (XWPFRun run : p.getRuns()) {
            textPart.append(run.toString());
        }
        System.out.println(textPart);
    }
}
thx.
Upvotes: 0
Views: 1418
Reputation: 61945
XWPF
of apache poi
does not support numbering in Word very well. So handling numbering is not really straightforward.
In Word numbered paragraphs have a num-id and the numbering level set in document. This num-id refers to a numbering in a separate numbering document part. There the numbering type (decimal, letter, roman, ...) and the numbering format is defined. The actual numbering of the paragraphs is determined by this num-id, numbering level, numbering type, numbering format and the count of paragraphs having the same num-id before in document. So it is really complex to manage numbering while reading a Word document.
The following working draft shows one example for how to manage numbering while reading a Word document using apache poi
. It is a working draft to show the principle using as little code as possible. It uses in-memory structures for storing the numbering level counters and the previous numbering level in the document. The code is additionally commented to show what it does.
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.*;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import java.math.BigInteger;
/**
 * Reads a Word document and prints a simple HTML-like dump of its paragraphs and tables.
 *
 * Numbered paragraphs are prefixed with their current number. The number is computed
 * while traversing the document by counting, per numbering ID, how many paragraphs of
 * each numbering level have been seen so far.
 *
 * An instance accumulates output and counters across calls to {@link #read(String)},
 * so use a fresh instance per document.
 */
public class WordReader {

    /** Largest count that is rendered as a roman numeral; larger counts fall back to decimal. */
    private static final int MAX_ROMAN = 3999;

    private static final int[] ROMAN_VALUES =
            {1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1};
    private static final String[] ROMAN_LOWER =
            {"m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i"};
    private static final String[] ROMAN_UPPER =
            {"M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I"};

    //memory structure for storing the numbering level counter: numbering ID -> (level -> count)
    private final Map<Integer, Map<Integer, Integer>> numIDLvlCnt = new HashMap<>();

    //memory structure for storing the previous numbering level: numbering ID -> level
    private final Map<Integer, Integer> numIDPrevNumIlv = new HashMap<>();

    //collected output
    private final StringBuilder content = new StringBuilder();

    /**
     * Appends all given body elements (paragraphs and tables) to the output.
     *
     * @param bodyElements the elements to traverse, in document order
     * @param crlf whether a line break is appended after each paragraph
     */
    private void traverseBodyElements(List<IBodyElement> bodyElements, boolean crlf) {
        for (IBodyElement bodyElement : bodyElements) {
            if (bodyElement instanceof XWPFParagraph) {
                XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
                String no = "";
                if (paragraph.getNumID() != null) { //if paragraph has numbering
                    no = getCurrentNumber(paragraph);
                }
                //print paragraph, if numbered then with leading number
                content.append("<p>");
                if (no.length() > 0) {
                    content.append(no).append(' ');
                }
                content.append(paragraph.getText());
                content.append("</p>");
                if (crlf) {
                    content.append("\r\n");
                }
            } else if (bodyElement instanceof XWPFTable) {
                XWPFTable table = (XWPFTable) bodyElement;
                content.append("<table>");
                content.append("\r\n");
                traverseTableRows(table.getRows());
                content.append("</table>");
                content.append("\r\n");
            } // ToDo: else ...
        }
    }

    /**
     * Appends all given table rows to the output, one {@code <tr>} per row.
     *
     * @param tableRows the rows to traverse
     */
    private void traverseTableRows(List<XWPFTableRow> tableRows) {
        for (XWPFTableRow tableRow : tableRows) {
            content.append("<tr>");
            traverseTableCells(tableRow.getTableICells());
            content.append("</tr>");
            content.append("\r\n");
        }
    }

    /**
     * Appends all given table cells to the output, one {@code <td>} per cell.
     * The body elements of a cell are written without paragraph line breaks.
     *
     * @param tableICells the cells to traverse; only {@code XWPFTableCell} instances are handled
     */
    private void traverseTableCells(List<ICell> tableICells) {
        for (ICell tableICell : tableICells) {
            if (tableICell instanceof XWPFTableCell) {
                XWPFTableCell tableCell = (XWPFTableCell) tableICell;
                content.append("<td>");
                traverseBodyElements(tableCell.getBodyElements(), false);
                content.append("</td>");
            } // ToDo: else ...
        }
    }

    /**
     * Counts up the level counter for the given numbering ID and numbering level.
     *
     * @param numID the numbering ID; nothing happens if {@code null}
     * @param numIlvl the numbering level; if {@code null} only the bookkeeping entries
     *        for the numbering ID are created and nothing is counted
     */
    private void setNumIDLvlCnt(Integer numID, Integer numIlvl) {
        if (numID == null) {
            return;
        }
        //get level counter for numbering ID, create a new one if there is none
        Map<Integer, Integer> lvlCnt = numIDLvlCnt.computeIfAbsent(numID, id -> new HashMap<>());
        //get previous level for numbering ID, start with level 0 if there is none
        Integer prevNumIlv = numIDPrevNumIlv.computeIfAbsent(numID, id -> 0);
        if (numIlvl == null) {
            return;
        }
        //if this level is lower than the previous one, then all deeper level counters needs starting new
        if (numIlvl < prevNumIlv) {
            lvlCnt.keySet().removeIf(ilvl -> ilvl > numIlvl);
        }
        //count up 1, starting at 1 if there is no counter for this level
        lvlCnt.merge(numIlvl, 1, Integer::sum);
        //set this level to be the previous level
        numIDPrevNumIlv.put(numID, numIlvl);
    }

    /**
     * Gets the formatted number from number format and level counter.
     *
     * @param numFmt the number format name, compared case-insensitively; may be {@code null}
     * @param cnt the level counter; may be {@code null}
     * @return the formatted number, or an empty string for an unsupported format,
     *         a {@code null} counter, or a letter/roman counter below 1
     */
    private String getNoFromCount(String numFmt, Integer cnt) {
        if (cnt == null) {
            return "";
        }
        if ("DECIMAL".equalsIgnoreCase(numFmt)) {
            return String.valueOf(cnt);
        } else if ("LOWERLETTER".equalsIgnoreCase(numFmt)) {
            return toLetters(cnt, 'a');
        } else if ("UPPERLETTER".equalsIgnoreCase(numFmt)) {
            return toLetters(cnt, 'A');
        } else if ("LOWERROMAN".equalsIgnoreCase(numFmt)) {
            return toRoman(cnt, ROMAN_LOWER);
        } else if ("UPPERROMAN".equalsIgnoreCase(numFmt)) {
            return toRoman(cnt, ROMAN_UPPER);
        } //ToDo: else ...
        return "";
    }

    /**
     * Formats a counter as a letter number: a..z for 1..26, then the same letter
     * repeated (aa, bb, ..., zz, aaa, ...) for each further pass through the alphabet.
     *
     * @param cnt the counter, starting at 1
     * @param first the first letter of the alphabet to use ({@code 'a'} or {@code 'A'})
     * @return the letter number, or an empty string if {@code cnt} is below 1
     */
    private static String toLetters(int cnt, char first) {
        if (cnt < 1) {
            return "";
        }
        int letterIndex = (cnt - 1) % 26;
        int repeats = (cnt - 1) / 26 + 1;
        return String.valueOf((char) (first + letterIndex)).repeat(repeats);
    }

    /**
     * Formats a counter as a roman numeral using the given symbols.
     *
     * @param cnt the counter
     * @param symbols the symbols matching {@code ROMAN_VALUES}, in lower or upper case
     * @return the roman numeral, an empty string if {@code cnt} is below 1, or the
     *         decimal representation if {@code cnt} is above {@code MAX_ROMAN}
     */
    private static String toRoman(int cnt, String[] symbols) {
        if (cnt < 1) {
            return "";
        }
        if (cnt > MAX_ROMAN) {
            return String.valueOf(cnt);
        }
        StringBuilder roman = new StringBuilder();
        int remaining = cnt;
        for (int i = 0; i < ROMAN_VALUES.length; i++) {
            while (remaining >= ROMAN_VALUES[i]) {
                roman.append(symbols[i]);
                remaining -= ROMAN_VALUES[i];
            }
        }
        return roman.toString();
    }

    /**
     * Gets the current number of a numbered paragraph and counts up its level counter.
     *
     * @param paragraph the paragraph to get the number for
     * @return the numbering level text with the placeholders %1, %2, %3, ... replaced,
     *         or an empty string if the paragraph has no numbering ID or no level text
     */
    private String getCurrentNumber(XWPFParagraph paragraph) {
        //ToDo: take paragraph.getNumStartOverride() into account
        //get numbering ID
        BigInteger numID = paragraph.getNumID();
        if (numID == null) {
            return "";
        }
        //get numbering format
        String numFmt = paragraph.getNumFmt(); //decimal, lowerletter, roman, ..
        //get current numbering level; a missing level is counted as level 0
        //NOTE(review): assumes a paragraph without explicit level belongs to level 0 -- confirm
        BigInteger numIlvl = paragraph.getNumIlvl();
        int level = (numIlvl == null) ? 0 : numIlvl.intValue();
        //set numbering level counter for current numbering ID and numbering level
        setNumIDLvlCnt(numID.intValue(), level);
        //get numbering level text
        String numLevelText = paragraph.getNumLevelText(); // %1.%2.%3...
        if (numLevelText == null) {
            return "";
        }
        //get level counter for this numbering ID
        Map<Integer, Integer> lvlCnt = numIDLvlCnt.get(numID.intValue());
        String no = numLevelText;
        //NOTE: the number format of the current level is applied to the counters of all levels
        for (Map.Entry<Integer, Integer> entry : lvlCnt.entrySet()) {
            int i = entry.getKey() + 1;
            //replace the placeholders %1, %2, %3, ... with formatted number from number format and level counter
            no = no.replace("%" + i, getNoFromCount(numFmt, entry.getValue()));
        }
        return no;
    }

    /**
     * Reads the Word document at the given path and prints the collected content
     * to standard output.
     *
     * @param inFilePath path of the Word document to read
     * @throws Exception if the file cannot be opened or read as a Word document
     */
    public void read(String inFilePath) throws Exception {
        //close stream and document even if traversing fails
        try (FileInputStream in = new FileInputStream(inFilePath);
             XWPFDocument document = new XWPFDocument(in)) {
            traverseBodyElements(document.getBodyElements(), true);
        }
        System.out.println(content);
    }

    public static void main(String[] args) throws Exception {
        String inFilePath = "./WordDocument.docx";
        WordReader reader = new WordReader();
        reader.read(inFilePath);
    }
}
Note: As your question asks about the best way: This shows one way. Whether it is "the best" one is not answerable here. Questions about "the best way" are always opinion-based and so are not questions to ask here. See https://stackoverflow.com/help/on-topic.
Upvotes: 2