Reputation: 69
I am trying to read a binary file which contains data between tags (XML style), and I am using a page like this:
<!DOCTYPE html>
<html lang="en">
<body>
<header>
<h1>Load a File</h1>
</header>
<main>
<input type="file" id="file">
</main>
<script>
// Decoder used to print the file's bytes as if they were UTF-8 text.
const enc = new TextDecoder("utf-8");

/**
 * `change` handler for the file input: reads the first selected file into an
 * ArrayBuffer and logs it twice - once as signed bytes, once decoded as UTF-8.
 *
 * @param {Event} evt - change event fired by the <input type="file"> element.
 */
function onfilechange(evt) {
  const [selFile] = evt.target.files;
  const reader = new FileReader();
  reader.onloadend = (e) => {
    const h3d = new Int8Array(e.target.result);
    console.log(h3d);
    console.log(enc.decode(h3d));
  };
  reader.readAsArrayBuffer(selFile);
}

document.getElementById('file').addEventListener('change', onfilechange);
</script>
</body>
</html>
And I get this result for the file (first Console.log):
Int8Array(1025) [9, 0, 0, 0, 8, 60, 77, 79, 68, 69, 76, 79, 62, 9, 0, 0, 0, 8, 60, 80, 79, 78, 84, 79, 83, 62, 10, 0, 0, 0, 2, 0, 0, 0, 1, 65, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 36, 64, 2, 0, 0, 0, 1, 66, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 68, 64, 0, 0, 0, 0, 0, 0, 36, 64, 2, 0, 0, 0, 1, 67, 0, 0, 0, 0, …]
and second Console.log:
<MODEL>
<POINTS>A$@$@$@B$@D@$@CN@D@$@DN@$@$@EN@$@>@FN@D@>@GD@$@I@HD@D@I@
I$@$@I@J$@D@I@</POINTS>
<FACES>ABCDADEGIIGHJAIJBHGEFFEDCJHFCB</FACES>
<SYSTEM></SYSTEM>
</MODEL>
How can I get the actual data stored in between the tags? I should be getting some point coordinates like:
<MODEL>
<POINTS>A,10,10,10; B,10,20,30; ...</POINTS>
<FACES>1,A,B,C,D; 2,A,D,E,G,I;... </FACES>
<SYSTEM>...and some other stuff!</SYSTEM>
</MODEL>
Thank you!
Upvotes: 2
Views: 1396
Reputation: 1018
There is a lot going on here so let me see if I can explain.
First, what you are expecting as the output of this file is not the actual data in this file. If you are going to create a binary format you need to document it. For example, the points in this file are not separated by commas and semicolons; each one is simply a single-byte ASCII character followed by three 8-byte numbers. As a note, the numbers used for the points are stored in little-endian byte order (least significant byte first), so they have to be read that way.
When you load a file from the OS you are getting the raw bytes of the file. Those bytes need to be converted into something usable, and that is why there are APIs like `TextDecoder` and `FileReader`. `TextDecoder` is designed to take a file that was encoded as text (UTF-8, UTF-16, etc.) and convert it to a JavaScript string. This is not a text file, it is binary, so you can't use `TextDecoder`.
The reason you are seeing this output is that when you display a binary file in a text editor (the console acts like a text editor here), it displays each byte as the corresponding ASCII character. That is why you are seeing all the garbled text - they are actually ASCII characters. The text editor doesn't know that they are supposed to be 8-byte numbers.
So, basically there are no JavaScript APIs that will convert this binary file into text because they don't know how - they don't know what the bytes being read represent. They also don't know how many bytes to read at a time (1 for a UTF-8 character, 2 for a UTF-16 character or 4 for an integer). You need to parse this file manually, which isn't as scary as it seems. Right now you are using an `Int8Array`, which will work, but you have to use array indexing and bit shifting to get the numbers. You should use `DataView` because it provides an API to read different types from the byte stream.
This is an example of how to parse this file.
// Parser state shared with readTag() and readPoints():
// `position` is the offset of the next unread byte, `h3d` is the DataView
// wrapping the bytes of the file currently being parsed.
let position = 0;
let h3d;

/**
 * `change` handler for the file input. Loads the first selected file as an
 * ArrayBuffer, scans it for opening tags and hands the body of every known
 * tag to its reader function. The parsed result is logged to the console.
 *
 * @param {Event} evt - change event fired by the <input type="file"> element.
 */
function onfilechange(evt) {
  const selFile = evt.target.files[0];
  if (!selFile) {
    // the selection was cleared - there is nothing to read
    return;
  }
  const reader = new FileReader();
  // `load` only fires after a successful read; `loadend` also fires on
  // failure, where `result` is null and cannot be wrapped in a DataView
  reader.onload = function (e) {
    h3d = new DataView(e.target.result);
    // start from the beginning for every file, otherwise a second selection
    // would resume at the offset where the previous parse stopped
    position = 0;
    const model = {};
    while (position < h3d.byteLength) {
      if (!isOpenBracket(h3d.getUint8(position++))) {
        continue;
      }
      if (position >= h3d.byteLength) {
        // the '<' was the very last byte of the file
        break;
      }
      // we are looking to see if we are starting a closing tag
      // don't increment position here - we just want to peek
      if (isSlash(h3d.getUint8(position))) {
        // we don't need the closing tag, so just read forward until the
        // closing bracket has been consumed (or the data runs out)
        while (position < h3d.byteLength && !isCloseBracket(h3d.getUint8(position++))) {
          // intentionally empty
        }
        // go back to the outer while loop
        continue;
      }
      // readTag() works on the shared `h3d` / `position` state
      const tag = readTag();
      switch (tag) {
        case 'PONTOS':
        case 'POINTS':
          model.points = readPoints();
          break; // without this the points tag would fall through into FACES
        case 'FACES':
          model.faces = readFaces();
          break;
        // add other tags you want to parse
        default:
          break;
      }
    }
    console.log(model);
  };
  reader.onerror = function () {
    console.error('Could not read the selected file', reader.error);
  };
  reader.readAsArrayBuffer(selFile);
}
document.getElementById('file').addEventListener('change', onfilechange);
/**
 * Reads a tag name from the shared DataView `h3d`, starting at the shared
 * offset `position` (the byte right after the opening bracket).
 *
 * Bytes are collected until the closing bracket is found; the bracket itself
 * is consumed, so afterwards `position` points at the first byte of the tag
 * body. If the data ends before a closing bracket is seen, the bytes read so
 * far are returned.
 *
 * @returns {string} the tag name without the surrounding brackets.
 */
function readTag() {
  const tag = [];
  while (position < h3d.byteLength) {
    // declared locally: assigning to an undeclared `val` creates an implicit
    // global and throws a ReferenceError in strict mode / ES modules
    const val = h3d.getUint8(position++);
    if (isCloseBracket(val)) {
      break;
    }
    tag.push(val);
  }
  return String.fromCodePoint(...tag);
}
/**
 * Reads the body of a points tag from the shared DataView `h3d`, starting at
 * the shared offset `position`.
 *
 * Each point is an upper case letter (its name) followed by three 8-byte
 * floating point numbers. Reading stops once the opening bracket of the
 * closing tag has been consumed, or when the data runs out.
 *
 * @returns {Object<string, number[]>} map from point name to its three numbers.
 * @throws {RangeError} if a point name is not followed by 24 bytes of data.
 */
function readPoints() {
  const COORDS_PER_POINT = 3;
  const BYTES_PER_COORD = 8;
  const points = {}; // or use a Map
  // do this until we hit the opening bracket of the closing tag
  while (position < h3d.byteLength) {
    // declared locally: assigning to an undeclared `val` creates an implicit
    // global and throws a ReferenceError in strict mode / ES modules
    const val = h3d.getUint8(position++);
    if (isOpenBracket(val)) {
      break;
    }
    if (!isChar(val)) {
      // not a point name - skip the byte
      continue;
    }
    if (position + COORDS_PER_POINT * BYTES_PER_COORD > h3d.byteLength) {
      throw new RangeError(
        `Truncated data for point '${String.fromCodePoint(val)}' at offset ${position}`
      );
    }
    const coords = [];
    for (let i = 0; i < COORDS_PER_POINT; i++) {
      // the numbers are stored little-endian in the file, hence `true`
      coords.push(h3d.getFloat64(position, true));
      position += BYTES_PER_COORD;
    }
    points[String.fromCodePoint(val)] = coords;
  }
  return points;
}
/**
 * Placeholder for parsing the body of the faces tag.
 *
 * Not implemented: the layout of the face data is not known, so nothing is
 * read and nothing is returned.
 *
 * @returns {undefined}
 */
function readFaces() {
  return undefined;
}
// these functions check ASCII values - no need to convert them to strings
/**
 * Tells whether a byte value is an ASCII whitespace code.
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true for 9 (tab), 10 (line feed), 11 (vertical tab),
 *   12 (form feed), 13 (carriage return) and 32 (space).
 */
function isWhitesapce(value) {
  switch (value) {
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 32:
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether a byte value is the ASCII code of '<'.
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for 60.
 */
function isOpenBracket(value) {
  const LESS_THAN = 60;
  return LESS_THAN === value;
}
/**
 * Tells whether a byte value is the ASCII code of '>'.
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for 62.
 */
function isCloseBracket(value) {
  const GREATER_THAN = 62;
  return GREATER_THAN === value;
}
/**
 * Tells whether a byte value is the ASCII code of '/'.
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for 47.
 */
function isSlash(value) {
  const FORWARD_SLASH = 47;
  return FORWARD_SLASH === value;
}
/**
 * Tells whether a byte value is an upper case ASCII letter ('A'..'Z').
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true for 65 through 90 inclusive.
 */
function isChar(value) {
  // 'A' is decimal 65 (hex 0x41) and 'Z' is decimal 90. A lower bound of
  // decimal 41 would also accept digits, '<', '>' and '@'.
  return value >= 65 && value <= 90;
}
This is just quick and dirty. I would create a separate class that parses this file format.
A few things to note:
In the `readPoints` function, the `littleEndian` argument of `getFloat64` is set to `true` because the numbers for the points are stored in the file in little-endian order (least significant byte first).

That should be enough to figure out how to parse the rest of the file. You just need to know what the data format in each tag is.
Upvotes: 2