Reputation: 145
I have the following code adapted from here that I am using with Node.js and Cheerio to read html files and split large source files into small chunks. The code is working well for a single file.
Now I need to read multiple large HTML files, split them one after the other, and write the resulting files to a folder. How can I read every file in a folder, split each one, and write out the resulting chunks?
Here is the code:
var cheerio = require('cheerio'),
fs = require('fs');
// Read the single source file as UTF-8 text; when the read completes,
// dataLoaded is invoked with (err, data).
fs.readFile('./sourceHtml2/testone.html', 'utf8', dataLoaded);
/**
 * Callback for fs.readFile: splits the loaded HTML into one file per direct
 * child <div> of #toplevel and writes each one to ./output2/<id>.html.
 *
 * @param {Error|null} err - read error reported by fs.readFile, if any
 * @param {string} data - the HTML source text
 * @throws {Error} rethrows the read error, since there is nothing to split
 */
function dataLoaded(err, data) {
  if (err) {
    throw err;
  }
  // Declared locally: a bare `$ = ...` creates an implicit global and is a
  // ReferenceError in strict mode / ES modules.
  var $ = cheerio.load(data);
  $('#toplevel > div').each(function (i, elem) {
    var id = $(elem).attr('id'),
      filename = id + '.html',
      content = $.html(elem);
    fs.writeFile('./output2/' + filename, content, function (writeErr) {
      if (writeErr) {
        // Report the failure instead of claiming the file was written.
        console.error('Failed to write ' + filename + ': ' + writeErr.message);
        return;
      }
      console.log('Written html to ' + filename);
    });
  });
}
Here is my sample source file
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Lorem Ipsum</title>
</head>
<body>
<div id="toplevel">
<div id="1-1">
<h1>HTML Ipsum Presents One</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
<div id="1-2">
<h1>HTML Ipsum Presents Two</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<blockquote>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
</blockquote>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
<div id="1-3">
<h1>HTML Ipsum Presents Three</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<blockquote>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
</blockquote>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
</div>
</body>
</html>
Your help will be greatly appreciated.
Upvotes: 0
Views: 853
Reputation: 5425
You need to process the files in the input directory as an array and you'll also want to prevent filename collisions in the output folder.
The code below addresses both issues. HTML files (.htm and .html) are read from the 'input' subfolder, and the generated files are written to the 'output' subfolder.
var cheerio = require('cheerio'),
fs = require('fs');
// process files found in the 'input' folder;
// findHtmlFiles is invoked with (err, files) once the listing is available
fs.readdir('./input', 'utf8', findHtmlFiles);
/**
 * Callback for fs.readdir: reads every '.htm' / '.html' file listed in
 * `files` from ./input and hands its contents to dataLoaded.
 *
 * @param {Error|null} err - error reported by fs.readdir, if any
 * @param {string[]} files - entry names found in the input folder
 * @throws {Error} rethrows the readdir error; a read error for an individual
 *   file is thrown from the asynchronous readFile callback
 */
function findHtmlFiles(err, files) {
  if (err) {
    throw err;
  }
  // Matches a trailing extension of 1-5 alphanumeric characters.
  var pattern = /\.[0-9a-z]{1,5}$/i;
  files.forEach(function (fullFilename) {
    var ext = fullFilename.match(pattern);
    // Names without an extension (e.g. 'README' or a sub-directory) do not
    // match, so `ext` is null and must not be indexed.
    if (!ext) {
      return;
    }
    // only process '.htm' and '.html' files; compare case-insensitively so
    // the check agrees with the case-insensitive pattern above
    var suffix = ext[0].toLowerCase();
    if (suffix !== '.htm' && suffix !== '.html') {
      return;
    }
    fs.readFile('./input/' + fullFilename, 'utf8', function (readErr, data) {
      if (readErr) {
        throw readErr;
      }
      // add the file name (extension stripped) to prevent collisions
      // in the output folder
      dataLoaded(null, {
        file: fullFilename.slice(0, -ext[0].length),
        data: data
      });
    });
  });
}
/**
 * Splits one loaded HTML document into one file per direct child <div> of
 * #toplevel and writes each to ./output/<file>_<id>.html.
 *
 * @param {Error|null} err - error from the caller, if any
 * @param {{file: string, data: string}} fd - source file name (used as the
 *   output prefix) and the HTML text to split
 * @throws {Error} rethrows `err` when one is supplied
 */
function dataLoaded(err, fd) {
  if (err) {
    throw err;
  }
  // Declared locally: a bare `$ = ...` creates an implicit global and is a
  // ReferenceError in strict mode / ES modules.
  var $ = cheerio.load(fd.data);
  $('#toplevel > div').each(function (i, elem) {
    var id = $(elem).attr('id'),
      filename = fd.file + '_' + id + '.html',
      content = $.html(elem);
    fs.writeFile('./output/' + filename, content, function (writeErr) {
      if (writeErr) {
        // Report the failure instead of claiming the file was written.
        console.error('Failed to write ' + filename + ': ' + writeErr.message);
        return;
      }
      console.log('Written html to ' + filename);
    });
  });
}
Sample console output:
Written html to testone_1-1.html
Written html to testone_1-2.html
Written html to testone_1-3.html
Written html to testtwo_1-1.html
Written html to testtwo_1-2.html
Written html to testtwo_1-3.html
Upvotes: 1