Reputation: 1121
I want to extract text from a .docx file. I have tried using mammoth:
const mammoth = require("mammoth");

// Extract the raw text of ./doc.docx, log it in one piece, then log it again
// one character at a time.
mammoth.extractRawText({path: "./doc.docx"})
    .then((extraction) => {
        const rawText = extraction.value; // The raw text, as a single string

        // Logs the entire text of the docx file in one call.
        console.log(rawText);

        // Logs every character (UTF-16 code unit) of the text on its own line.
        for (let idx = 0; idx < rawText.length; idx += 1) {
            console.log(rawText[idx]);
        }

        // Read from the result but not used any further in this snippet.
        const reportedMessages = extraction.messages;
    })
    .done();
But the problem is that this for loop gives me the data character by character, and I want it line by line instead. Please help me here, or is there any other method that you know of?
Upvotes: 5
Views: 18386
Reputation: 26
Keep in mind, mammoth
doesn't support extracting text from DocX Header and Footer (GitHub issue)
If you need to extract text from there as well, I can recommend using the word-extractor
package (link).
Here is an example of usage:
const WordExtractor = require('word-extractor');
/**
 * Extracts a document with word-extractor and returns its header, body and
 * footer text joined into one string.
 *
 * @param {*} buffer - Passed unchanged to `WordExtractor#extract`
 *   (presumably the document's bytes as a Buffer; confirm against the caller).
 * @returns {Promise<string>} Header text, body text and footer text, in that
 *   order, separated by single "\n" characters.
 */
async function getTextFromBuffer(buffer) {
  const extracted = await new WordExtractor().extract(buffer);

  const body = extracted.getBody();
  // Footers are excluded here because they are fetched separately below.
  const headers = extracted.getHeaders({ includeFooters: false });
  const footers = extracted.getFooters();

  return `${headers}\n${body}\n${footers}`;
}
Upvotes: 0
Reputation: 37095
One method is to fetch the whole text and then split by '\n'
:
import superagent from 'superagent';
import mammoth from 'mammoth';
const url = 'http://www.ojk.ee/sites/default/files/respondus-docx-sample-file_0.docx';
/**
 * Downloads the .docx at `url` as a buffer, extracts its raw text with
 * mammoth, and logs the text split on "\n" as an array.
 *
 * @returns {Promise<void>} Settles after the array of lines has been logged;
 *   rejects if the download or the extraction fails.
 */
const main = async () => {
  // Request the response body as a buffer, using superagent's image parser.
  const pendingDownload = superagent
    .get(url)
    .parse(superagent.parse.image)
    .buffer();
  const { body: buffer } = await pendingDownload;

  const { value: text } = await mammoth.extractRawText({ buffer });

  console.log(text.split('\n'));
};
main().catch(error => console.error(error));
Upvotes: 4
Reputation: 424
You can use any-text
Usage is simple:
const reader = require('any-text');

// Request the file's text from any-text and log it once the promise fulfills.
reader.getText(`path-to-file`).then((contents) => {
    console.log(contents);
});
Upvotes: 2
Reputation: 163
const mammoth = require("mammoth");
const path = require("path");

// Build the document path relative to this script's own directory.
const docxLocation = path.join(__dirname,'./doc.docx');

// Extract the raw text, then log it piece by piece, split on "\n".
mammoth.extractRawText({path: docxLocation})
    .then((outcome) => {
        const fullText = outcome.value; // The raw text

        // (Logging fullText directly here would print the whole document at once.)
        console.log('------------------------------');

        // One console.log call per "\n"-separated piece of the text.
        fullText.split("\n").forEach((piece) => {
            console.log(piece);
        });

        // Read from the result but not used any further in this snippet.
        const reportedMessages = outcome.messages;
    })
    .done();
Upvotes: 0