Extract text from a Text Box of a Word document using Office JS Api

Question

I have some text boxes (Shape > Text Box) inside a Word document. The document is a CV template that contains many of them. I would like to select all text boxes in the document, extract their text, remove the text boxes, and insert the extracted text into the document. I have tried:

 // Get a proxy object for the user's current selection in the document.
 const range =  context.document.getSelection();
 // Queue loading of the "text" property; the value is only readable after context.sync() resolves.
 range.load("text");

and then sync the context so that I can get the text.

Konstantinos Cheilakos · Accepted Answer

I finally used the following workaround. It is fast and works nicely on both Windows & macOS.

1. Get OOXML of the document's body
2. Parse OOXML.value & generate an xmlDocument (xmlDoc)
3. Detect existing Textboxes & Shapes that contain text: getElementsByTagName("wps:wsp")
4. Extract text from (3)
5. Generate a simple xml TextElement with the text extracted
6. Replace (3) with (5)
7. Serialize the updated xmlDoc to an xmlString and get the updated OOXML.value

8. Insert the updated OOXML.value into the document, replacing the existing one

/**
 * Replaces every text box (any run containing a <wps:txbx>) in the document
 * body with a plain run holding the text box's text.
 *
 * Flow: read the body OOXML -> parse it as XML -> swap each text-box run for
 * a run made of <w:t>/<w:br> children -> serialize -> write the OOXML back.
 * If the body contains no text boxes it is left untouched.
 */
Word.run(function (context) {
    // Select the document body and queue retrieval of its OOXML.
    const body = context.document.body;
    const ooxml = body.getOoxml();

    return context.sync().then(function () {
        const xmlDoc = new DOMParser().parseFromString(ooxml.value, "text/xml");

        // DOMParser reports failures through a <parsererror> element instead
        // of throwing; bail out before the body gets cleared.
        if (xmlDoc.getElementsByTagName("parsererror").length > 0) {
            throw new Error("Unable to parse the OOXML of the document body");
        }

        // getElementsByTagName returns a live collection; take a snapshot so
        // replacing nodes while iterating cannot shift the indices.
        const runs = Array.from(xmlDoc.getElementsByTagName("w:r"));
        let replacedCount = 0;

        for (const run of runs) {
            // Runs nested inside an already replaced text box are detached.
            if (!xmlDoc.contains(run)) continue;

            // Skip runs that hold no text box / shape / WordArt.
            const textboxContainer = run.getElementsByTagName("wps:txbx")[0];
            if (!textboxContainer) continue;

            // Create the new elements in the same namespace as the run being
            // replaced, so they serialize as genuine WordprocessingML.
            const wordNs = run.namespaceURI;
            const newRun = xmlDoc.createElementNS(wordNs, "w:r");
            newRun.appendChild(xmlDoc.createElementNS(wordNs, "w:br"));

            const paragraphs = Array.from(textboxContainer.getElementsByTagName("w:p"));
            for (const paragraph of paragraphs) {
                const textNodes = Array.from(paragraph.getElementsByTagName("w:t"));
                // Paragraphs without text contribute nothing.
                if (textNodes.length === 0) continue;

                // A space is appended after every text fragment so fragments
                // coming from separate runs do not run together.
                let textExtracted = "";
                for (const textNode of textNodes) {
                    textExtracted += textNode.textContent + " ";
                }

                // <w:t> and <w:br> go directly into the run: a <w:r> must
                // not contain another <w:r>.
                const newText = xmlDoc.createElementNS(wordNs, "w:t");
                newText.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:space", "preserve");
                newText.textContent = textExtracted;
                newRun.appendChild(newText);
                newRun.appendChild(xmlDoc.createElementNS(wordNs, "w:br"));
            }

            // Replace the text-box run with the plain-text run.
            run.replaceWith(newRun);
            replacedCount += 1;
        }

        // Nothing to change: avoid clearing and rewriting the whole body.
        if (replacedCount === 0) {
            console.log('done');
            return undefined;
        }

        // Serialize the DOM, clear the body and write the updated OOXML back.
        const serializedXML = new XMLSerializer().serializeToString(xmlDoc.documentElement);
        body.clear();
        return context.sync()
            .then(function () {
                body.insertOoxml(serializedXML, Word.InsertLocation.replace);
                // Flush the insert before reporting success.
                return context.sync();
            })
            .then(function () {
                console.log('done');
            });
    });
})
.catch(error => {
    console.log('Error: ', error);
});

Extract text from a Text Box of a Word document using Office JS Api

Answers (1)

Related Questions