nigelg
nigelg

Reputation: 161

Insert multiple copied paragraphs in XWPFDocument

I am trying to copy paragraphs of an XWPFDocument using Apache POI. Since POI has no method to insert a pre-made paragraph at an arbitrary point, I've read plenty of answers suggesting to first insert a throwaway paragraph using insertNewParagraph(), then replace the temporary paragraph with the one I actually want using setParagraph(). This is further complicated by the fact that insertNewParagraph can't simply take the desired index into the body's list of elements (like how XWPFTable.addRow(row,pos) works); instead you must pass it an XmlCursor.

I created TestIn.docx as a test document with 6 paragraphs: A, B, C, D, E, F.

import java.io.FileInputStream;
import java.io.FileOutputStream;

import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.xmlbeans.XmlCursor;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;

/**
 * Stand-alone reproduction of the list inconsistencies that appear when a placeholder paragraph is
 * inserted with {@code insertNewParagraph} and then overwritten with {@code setParagraph}, done twice
 * in a row on the same document.
 *
 * After every step the contents of {@code getBodyElements()} and {@code getParagraphs()} are printed
 * as short labels so the two lists can be compared.
 */
public class ParagraphIssue
{
    /**
     * Appends a short label identifying {@code elem} to {@code s}.
     *
     * The element is compared by reference against each of the known paragraphs; the first match
     * decides the label. Anything that matches none of them (for example a table) is labelled "U".
     * Labels are separated by a single space.
     *
     * @param elem the body element to label
     * @param s    the buffer the label is appended to
     * @param a    original paragraph labelled "A"
     * @param b    original paragraph labelled "B"
     * @param c    original paragraph labelled "C"
     * @param d    original paragraph labelled "D"
     * @param e    original paragraph labelled "E"
     * @param f    original paragraph labelled "F"
     * @param t1   first temporary paragraph, labelled "T1" (may be null)
     * @param r1   first replacement paragraph, labelled "R1" (may be null)
     * @param t2   second temporary paragraph, labelled "T2" (may be null)
     * @param r2   second replacement paragraph, labelled "R2" (may be null)
     */
    public void debugElement (IBodyElement elem, StringBuilder s, XWPFParagraph a, XWPFParagraph b, XWPFParagraph c, XWPFParagraph d, XWPFParagraph e, XWPFParagraph f,
        XWPFParagraph t1, XWPFParagraph r1, XWPFParagraph t2, XWPFParagraph r2)
    {
        if (s.length () > 0) s.append (" ");
        if (elem == a) s.append ("A");
        else if (elem == b) s.append ("B");
        else if (elem == c) s.append ("C");
        else if (elem == d) s.append ("D");
        else if (elem == e) s.append ("E");
        else if (elem == f) s.append ("F");
        else if (elem == t1) s.append ("T1");
        else if (elem == r1) s.append ("R1");
        else if (elem == t2) s.append ("T2");
        else if (elem == r2) s.append ("R2");
        else s.append ("U");
    }
    
    /**
     * Prints two lines: the labels of every entry in {@code doc.getBodyElements()} and the labels of
     * every entry in {@code doc.getParagraphs()}, using {@link #debugElement} for the labelling.
     *
     * @param doc the document whose two lists are dumped
     * @param a   original paragraph labelled "A"
     * @param b   original paragraph labelled "B"
     * @param c   original paragraph labelled "C"
     * @param d   original paragraph labelled "D"
     * @param e   original paragraph labelled "E"
     * @param f   original paragraph labelled "F"
     * @param t1  first temporary paragraph (may be null)
     * @param r1  first replacement paragraph (may be null)
     * @param t2  second temporary paragraph (may be null)
     * @param r2  second replacement paragraph (may be null)
     */
    public void debug (XWPFDocument doc, XWPFParagraph a, XWPFParagraph b, XWPFParagraph c, XWPFParagraph d, XWPFParagraph e, XWPFParagraph f,
        XWPFParagraph t1, XWPFParagraph r1, XWPFParagraph t2, XWPFParagraph r2)
    {
        StringBuilder s = new StringBuilder ();
        for (IBodyElement elem : doc.getBodyElements ())
            debugElement (elem, s, a, b, c, d, e, f, t1, r1, t2, r2);
        System.out.println("Elements: " + s);
        
        s = new StringBuilder ();
        for (XWPFParagraph para : doc.getParagraphs ())
            debugElement (para, s, a, b, c, d, e, f, t1, r1, t2, r2);
        System.out.println("Paragraphs: " + s);
    }
    
    /**
     * Clones body elements 0 and 1 and tries to place each clone before a chosen body element by
     * inserting a temporary paragraph and then overwriting it with {@code setParagraph}.
     *
     * The document must contain at least six paragraphs, and the body elements at indices 0, 1,
     * {@code insertionPoint} and {@code insertionPoint + 1} (the latter read after the first insert)
     * must be paragraphs, because they are cast to {@link XWPFParagraph}.
     *
     * @param doc            the document to modify
     * @param insertionPoint index into {@code doc.getBodyElements()} of the element to insert before
     */
    public void run (XWPFDocument doc, int insertionPoint)
    {
        // Keep references to the six original paragraphs so the dumps can label them.
        XWPFParagraph paraA = doc.getParagraphs().get(0);
        XWPFParagraph paraB = doc.getParagraphs().get(1);
        XWPFParagraph paraC = doc.getParagraphs().get(2);
        XWPFParagraph paraD = doc.getParagraphs().get(3);
        XWPFParagraph paraE = doc.getParagraphs().get(4);
        XWPFParagraph paraF = doc.getParagraphs().get(5);
        
        System.out.println ("--- Document initial state ---");
        debug (doc, paraA, paraB, paraC, paraD, paraE, paraF, null, null, null, null);

        // Clone the first paragraph: copy its underlying CTP and wrap the copy in a new XWPFParagraph
        XWPFParagraph cloneThis = (XWPFParagraph) doc.getBodyElements ().get (0);
        XWPFParagraph clonedPara = new XWPFParagraph ((CTP) cloneThis.getCTP ().copy (), doc);
        
        // Insert a temporary paragraph before the body element at insertionPoint
        XWPFParagraph insertBeforePara = (XWPFParagraph) doc.getBodyElements ().get (insertionPoint);
        XmlCursor cursor = insertBeforePara.getCTP ().newCursor ();
        
        XWPFParagraph newPara = doc.insertNewParagraph (cursor);
        newPara.insertNewRun (0).setText ("this should get replaced");
        
        System.out.println ("--- Insert 1st temporary para before F ---");
        debug (doc, paraA, paraB, paraC, paraD, paraE, paraF, newPara, clonedPara, null, null);
        
        // Work out the temporary paragraph's index among paragraphs only: walk the body elements and
        // count those of the same element type that come before it (tables etc. are skipped).
        int newParaIndex = 0;
        for (IBodyElement elem : doc.getBodyElements ())
        {
            if (elem == newPara)
                break;
            else if (elem.getElementType () == newPara.getElementType ())
                newParaIndex++;
        }
        
        System.out.println ("1st temporary para is at index " + newParaIndex);      // 5, as expected
        
        // Now replace the added paragraph with the cloned one
        doc.setParagraph (clonedPara, newParaIndex);
        System.out.println ("--- Replace 1st temporary para ---");
        debug (doc, paraA, paraB, paraC, paraD, paraE, paraF, newPara, clonedPara, null, null);
        
        // Do exactly the same thing again to clone the second paragraph
        XWPFParagraph cloneThis2 = (XWPFParagraph) doc.getBodyElements ().get (1);
        XWPFParagraph clonedPara2 = new XWPFParagraph ((CTP) cloneThis2.getCTP ().copy (), doc);
        
        // insertionPoint + 1 because the first insert shifted the target element one place down
        XWPFParagraph insertBeforePara2 = (XWPFParagraph) doc.getBodyElements ().get (insertionPoint + 1);
        XmlCursor cursor2 = insertBeforePara2.getCTP ().newCursor ();
        
        XWPFParagraph newPara2 = doc.insertNewParagraph (cursor2);
        newPara2.insertNewRun (0).setText ("this should get replaced too");

        System.out.println ("--- Insert 2nd temporary para before F ---");
        debug (doc, paraA, paraB, paraC, paraD, paraE, paraF, newPara, clonedPara, newPara2, clonedPara2);
        
        // Same paragraph-only index computation as for the first temporary paragraph
        int newParaIndex2 = 0;
        for (IBodyElement elem : doc.getBodyElements ())
        {
            if (elem == newPara2)
                break;
            else if (elem.getElementType () == newPara2.getElementType ())
                newParaIndex2++;
        }
        
        System.out.println ("2nd temporary para is at index " + newParaIndex2);
        
        doc.setParagraph (clonedPara2, newParaIndex2);      // So then this replaces the wrong paragraph
        System.out.println ("--- Replace 2nd temporary para ---");
        debug (doc, paraA, paraB, paraC, paraD, paraE, paraF, newPara, clonedPara, newPara2, clonedPara2);
    }
    
    /**
     * Loads {@code W:\TestIn.docx}, runs the reproduction with insertion point 5 and writes the
     * result to {@code W:\TestOut.docx}. Any exception is printed to standard error.
     *
     * @param args ignored
     */
    public final static void main (final String [] args)
    {
        // The document is opened in the same try-with-resources as the stream so that it is
        // closed (releasing the underlying package) even when run() or write() throws.
        try (FileInputStream in = new FileInputStream ("W:\\TestIn.docx");
             XWPFDocument doc = new XWPFDocument (in))
        {
            new ParagraphIssue ().run (doc, 5);
            
            try (FileOutputStream out = new FileOutputStream ("W:\\TestOut.docx"))
            {
                doc.write (out);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace ();
        }
    }
}

A lot is debug code so I can get output that shows exactly what's happening:

--- Document initial state ---
Elements: A B C D E F
Paragraphs: A B C D E F
--- Insert 1st temporary para before F ---
Elements: A B C D E T1 F
Paragraphs: A B C D E T1 F
1st temporary para is at index 5 - perfect so far
--- Replace 1st temporary para ---
Elements: A B C D E T1 F
Paragraphs: A B C D E R1 F - The list of paragraphs has the replacement paragraph, but the list of elements still has the temporary paragraph
--- Insert 2nd temporary para before F ---
Elements: A B C D E T1 T2 F
Paragraphs: T2 A B C D E R1 F - now the 2nd temporary paragraph has gone to the front of the list; it's in the correct place in the list of elements
2nd temporary para is at index 6
--- Replace 2nd temporary para ---
Elements: A B C D E T1 T2 F
Paragraphs: T2 A B C D E R2 F - List of elements still contains temporary paragraphs; List of paragraphs has 2nd paragraph in wrong place

Amazingly, the saved Word doc actually looks correct, but I don't understand how when neither list looks correct.

As far as finding where to do the insert goes, so far I could've used int newParaIndex = doc.getPosOfParagraph (newPara);. Problem with this comes when you add tables into the mix. Now I edited the source doc and inserted a table so the list of elements now looks like A, B, (table), C, D, E, F and change insertionPoint to 6 accordingly.

Now you can no longer use doc.getPosOfParagraph () as this returns the index of the paragraph in the list of elements (including tables) but setParagraph needs the index of the paragraph in the list of paragraphs (excluding tables). Using doc.getParagraphPos() to compensate for this returns 0 for the 2nd inserted temporary paragraph because, as you can see in the output above, that's literally where it is. So I worked around this by searching only the paragraphs in the elements list, as you can see in the code.

Running again with the table added (this is the 'U' in the debug output):

--- Document initial state ---
Elements: A B U C D E F
Paragraphs: A B C D E F
--- Insert 1st temporary para before F ---
Elements: A B U C D E T1 F
Paragraphs: A B C D E T1 F
--- Replace 1st temporary para ---
Elements: A B U C D E T1 F
Paragraphs: A B C D E R1 F
--- Insert 2nd temporary para before F ---
Elements: A B U C D E T1 T2 F
Paragraphs: T2 A B C D E R1 F
2nd temporary para is at index 6
--- Replace 2nd temporary para ---
Elements: A B U C D E T1 T2 F
Paragraphs: T2 A B C D E R2 F

Again this does actually generate the correct output in the saved doc. My questions are:

  1. Is there a better way to do this that fixes the screwyness of temporary paragraphs being replaced in one list but not the other, and of the 2nd temporary paragraph showing up at the front of the list? For example should I re-use the same XmlCursor to insert the 2nd temporary paragraph? Should I make all the temporary paragraphs in one go and then replace them all in one hit afterwards rather than doing one at a time? Would anything like this help?
  2. When I try this approach in our real app, Word complains the document is corrupted. It offers to attempt to recover it, and if I click Yes then it opens and the content and all the copied paragraphs all look correct, but the odd behaviour here is causing the corrupt doc warning.

Upvotes: 1

Views: 466

Answers (1)

Bart van Oort
Bart van Oort

Reputation: 350

I was running into the same problem while trying to duplicate paragraphs in a Word document with Apache POI. I found that the solution to your first question is indeed to make all the temporary paragraphs (i.e. all calls to insertNewParagraph) in one go, then replace them all with the to-be-duplicated content after (i.e. all calls to setParagraph).

I ended up with the following working solution:

/** Utilities for managing paragraphs in Word documents. */
public class Paragraphs {
  /**
   * Inserts {@code times} copies of the given paragraph into its document, immediately before the
   * paragraph itself.
   *
   * @param paragraph the paragraph to duplicate; it must be one that {@code insertNewParagraph}
   *     accepts a cursor for (presumably a direct child of the document body — confirm against POI)
   * @param times how many copies to insert; zero or a negative value inserts nothing
   * @return the paragraph objects that were passed to {@code setParagraph}, in document order
   * @throws IllegalArgumentException if the document refuses to insert a paragraph at the given
   *     paragraph's position
   */
  public static List<XWPFParagraph> duplicate(XWPFParagraph paragraph, int times) {
    // **Implementation note:**
    // Due to some weird behaviour with Apache POI's insertNewParagraph and setParagraph as described in
    // https://stackoverflow.com/questions/75289475/insert-multiple-copied-paragraphs-in-xwpfdocument
    // we have to insert the new paragraphs first, then copy the contents of the original paragraph to them.
    // We cannot insert and copy in the same loop, because once setParagraph has been called,
    // insertNewParagraph inserts subsequent paragraphs at position 0 of document.getParagraphs(),
    // so only the first duplication would succeed.

    var document = paragraph.getDocument();

    // Phase 1: insert all empty placeholder paragraphs.
    var newParagraphs = new ArrayList<XWPFParagraph>();
    try (var cursor = paragraph.getCTP().newCursor()) {
      for (int i = 0; i < times; i++) {
        var placeholder = document.insertNewParagraph(cursor);
        if (placeholder == null) {
          // Fail here with a clear message instead of storing null and failing later with an
          // unrelated index error.
          throw new IllegalArgumentException(
              "Cannot insert a paragraph at the position of the given paragraph");
        }
        newParagraphs.add(placeholder);

        // Move the cursor forward to the next START token so the next placeholder is inserted
        // there. Also stop on NONE (no further tokens) so the loop cannot spin forever.
        TokenType token;
        do {
          token = cursor.toNextToken();
        } while (token != TokenType.START && token != TokenType.NONE);
      }
    }

    // Phase 2: overwrite every placeholder with a copy of the original paragraph.
    for (int i = 0; i < newParagraphs.size(); i++) {
      // setParagraph expects an index into the list of paragraphs, whereas getPosOfParagraph
      // returns an index into the list of body elements (which also counts tables). The two differ
      // as soon as a table precedes the paragraph, so look the placeholder up in getParagraphs().
      var placeholderIndex = document.getParagraphs().indexOf(newParagraphs.get(i));
      var copy = new XWPFParagraph((CTP) paragraph.getCTP().copy(), document);
      document.setParagraph(copy, placeholderIndex);

      newParagraphs.set(i, copy); // replace the empty paragraph with the copied one
    }

    return newParagraphs;
  }
}

The cause of this strange behaviour, as far as I can tell, is indeed related to the TODO comment in setParagraph as Axel Richter points out in his comment. To demonstrate this behaviour, I wrote these passing tests:

  /**
   * Pins down what happens when insertNewParagraph is called again after setParagraph has
   * already been used on the same document: the second inserted paragraph shows up at index 0 of
   * getParagraphs(), and after the second setParagraph the entry at index 0 throws
   * XmlValueDisconnectedException when its text is read.
   */
  @Test
  void demoApachePoiBugInvalidBehaviour() {
    // Start with a single paragraph.
    var document = new XWPFDocument();
    var paragraph1 = document.createParagraph();
    paragraph1.createRun().setText("Hello World!");

    // First insert, using a cursor on paragraph1: the new paragraph lands in front of it.
    var paragraph2 = document.insertNewParagraph(paragraph1.getCTP().newCursor());
    paragraph2.createRun().setText("Hello People!");

    assertEquals("Hello People!", document.getParagraphs().get(0).getText());
    assertEquals("Hello World!", document.getParagraphs().get(1).getText());

    // First replace: overwrite the inserted paragraph with a copy of paragraph1.
    var paragraph2Position = document.getPosOfParagraph(paragraph2);
    var newParagraph2 = new XWPFParagraph((CTP) paragraph1.getCTP().copy(), document);
    document.setParagraph(newParagraph2, paragraph2Position);

    assertEquals("Hello World!", document.getParagraphs().get(0).getText());
    assertEquals("Hello World!", document.getParagraphs().get(1).getText());

    // so far so good.

    // However, inserting a new paragraph at the position of paragraph1 now results in the new paragraph being inserted
    // at position 0, while it should be inserted at position 1 (one before last).

    var paragraph3 = document.insertNewParagraph(paragraph1.getCTP().newCursor());
    paragraph3.createRun().setText("Hello Opinity!");

    assertEquals("Hello Opinity!", document.getParagraphs().get(0).getText());
    assertEquals("Hello World!", document.getParagraphs().get(1).getText());
    assertEquals("Hello World!", document.getParagraphs().get(2).getText());

    // Second replace, using the position reported by getPosOfParagraph for paragraph3.
    var paragraph3Position = document.getPosOfParagraph(paragraph3);
    var newParagraph3 = new XWPFParagraph((CTP) paragraph1.getCTP().copy(), document);
    document.setParagraph(newParagraph3, paragraph3Position);

    // The entry at index 0 can no longer be read, while the other two still hold the copied text.
    assertThrows(XmlValueDisconnectedException.class, () -> document.getParagraphs().get(0).getText());
    assertEquals("Hello World!", document.getParagraphs().get(1).getText());
    assertEquals("Hello World!", document.getParagraphs().get(2).getText());
  }

As opposed to the expected behaviour when calling insertNewParagraph and setParagraph in the right order:

  /**
   * Shows the ordering that works: perform every insertNewParagraph call first and only then the
   * setParagraph calls. All three paragraphs end up in the expected order with the copied text.
   */
  @Test
  void demoApachePoiBugValidBehaviour() {
    // Start with a single paragraph.
    var document = new XWPFDocument();
    var paragraph1 = document.createParagraph();
    paragraph1.createRun().setText("Hello World!");

    // Both inserts happen before any setParagraph call; each uses a fresh cursor on paragraph1.
    var paragraph2 = document.insertNewParagraph(paragraph1.getCTP().newCursor());
    paragraph2.createRun().setText("Hello People!");

    var paragraph3 = document.insertNewParagraph(paragraph1.getCTP().newCursor());
    paragraph3.createRun().setText("Hello Opinity!");

    // The second insert lands between the first inserted paragraph and paragraph1.
    assertEquals("Hello People!", document.getParagraphs().get(0).getText());
    assertEquals("Hello Opinity!", document.getParagraphs().get(1).getText());
    assertEquals("Hello World!", document.getParagraphs().get(2).getText());

    // Replace the first inserted paragraph with a copy of paragraph1.
    var paragraph2Position = document.getPosOfParagraph(paragraph2);
    var newParagraph2 = new XWPFParagraph((CTP) paragraph1.getCTP().copy(), document);
    document.setParagraph(newParagraph2, paragraph2Position);

    assertEquals("Hello World!", document.getParagraphs().get(0).getText());
    assertEquals("Hello Opinity!", document.getParagraphs().get(1).getText());
    assertEquals("Hello World!", document.getParagraphs().get(2).getText());

    // Replace the second inserted paragraph with another copy of paragraph1.
    var paragraph3Position = document.getPosOfParagraph(paragraph3);
    var newParagraph3 = new XWPFParagraph((CTP) paragraph1.getCTP().copy(), document);
    document.setParagraph(newParagraph3, paragraph3Position);

    // All three paragraphs are readable and carry the copied text.
    assertEquals("Hello World!", document.getParagraphs().get(0).getText());
    assertEquals("Hello World!", document.getParagraphs().get(1).getText());
    assertEquals("Hello World!", document.getParagraphs().get(2).getText());
  }

Upvotes: 0

Related Questions