How can I read a binary file (xml style) in javascript?

Question

I am trying to read a binary file which contains data between tags (XML style), and I am using a page like this:

    


    
        Load a File

And I get this result for the file (first Console.log):

Int8Array(1025) [9, 0, 0, 0, 8, 60, 77, 79, 68, 69, 76, 79, 62, 9, 0, 0, 0, 8, 60, 80, 79, 78, 84, 79, 83, 62, 10, 0, 0, 0, 2, 0, 0, 0, 1, 65, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 36, 64, 2, 0, 0, 0, 1, 66, 0, 0, 0, 0, 0, 0, 36, 64, 0, 0, 0, 0, 0, 0, 68, 64, 0, 0, 0, 0, 0, 0, 36, 64, 2, 0, 0, 0, 1, 67, 0, 0, 0, 0, …]

and second Console.log:


 A$@$@$@B$@D@$@CN@D@$@DN@$@$@EN@$@>@FN@D@>@GD@$@I@HD@D@I@ 
 I$@$@I@J$@D@I@
 ABCDADEGIIGHJAIJBHGEFFEDCJHFCB

How can I get the actual data stored in between the tags? I should be getting some point coordinates like:

 
   A,10,10,10; B,10,20,30; ...
   1,A,B,C,D; 2,A,D,E,G,I;...  
   ...and some other stuff!

Thank you!

Andrew Alderson · Accepted Answer

There is a lot going on here so let me see if I can explain.

First, what you are expecting as the output of this file is not the actual data in this file. If you are going to create a binary format you need to document it. For example, the points in this file are not separated by commas and semicolons; each one is simply a single-byte ASCII character followed by three 8-byte numbers. As a note, the numbers used for the points are encoded in the opposite byte order from the rest of the file.

When you load a file from the OS you are getting the raw bytes of the file. Those bytes need to be converted into something usable, and that is why there are APIs like TextDecoder and FileReader. TextDecoder is designed to take a file that was encoded as a text file (UTF-8, UTF-16, etc.) and convert it to a JavaScript string. This is not a text file, it is binary, so you can't use TextDecoder.

The reason that you are seeing the output that you are is because when you display a binary file in a text editor (the console is a text editor) it displays each byte as the corresponding ASCII character. That is why you are seeing all the garbled text - the bytes are being shown as ASCII characters. The text editor doesn't know that they are supposed to be 8-byte numbers.

So, basically there are no JavaScript APIs that will convert this binary file into text because it doesn't know how - it doesn't know what the bytes it is reading represent. It also doesn't know how many bytes to read at a time (1 for a UTF-8 character, 2 for a UTF-16 character or 4 for an integer). You need to parse this file manually which isn't as scary as it seems. Right now you are using an Int8Array which will work but you have to use Array indexing and you will have to use bit shifting to get the numbers. You should use DataView because it provides an API to read different types from the byte stream.

This is an example of how to parse this file.

// Byte offset of the next read in the file; DataView does not advance it automatically.
let position = 0;
// DataView over the loaded file's bytes; assigned once the file has been read.
let h3d;
/**
 * 'change' handler for the file input: reads the selected file as an
 * ArrayBuffer and walks it byte by byte looking for '<TAG>' markers,
 * dispatching to the matching reader for each opening tag it recognises.
 *
 * Uses the shared `h3d` (DataView) and `position` (read offset) variables,
 * which the reader helpers also read and advance.
 *
 * @param {Event} evt - change event whose target exposes a `files` list.
 */
function onfilechange(evt) {
    const selFile = evt.target.files[0];
    // The user may cancel the dialog, leaving no file to read.
    if (!selFile) {
        return;
    }
    const reader = new FileReader();
    // 'load' only fires on success; 'loadend' also fires on failure, when result is null.
    reader.onload = function (e) {
        h3d = new DataView(e.target.result);
        // Start from the beginning every time so a second file is parsed too.
        position = 0;
        while (position < h3d.byteLength) {
            if (!isOpenBracket(h3d.getUint8(position++))) {
                continue;
            }
            // A '<' in the very last byte has nothing after it to inspect.
            if (position >= h3d.byteLength) {
                break;
            }
            // We are looking to see if we are starting a closing tag.
            // Don't increment position here - we just want to peek.
            if (isSlash(h3d.getUint8(position))) {
                // We don't need the closing tag, so read forward to the closing
                // bracket (or the end of the data); 'position' ends up just past it.
                while (position < h3d.byteLength && !isCloseBracket(h3d.getUint8(position++))) {
                    // intentionally empty
                }
                // go back to the outer while loop
                continue;
            }
            const tag = readTag();
            switch (tag) {
                case 'PONTOS':
                case 'POINTS': {
                    const points = readPoints();
                    // use `points` here
                    break; // without this, execution falls through into FACES
                }
                case 'FACES': {
                    const faces = readFaces();
                    // use `faces` here
                    break;
                }
                // add other tags you want to parse
                default:
                    break;
            }
        }
    };
    reader.readAsArrayBuffer(selFile);
}

// Run onfilechange whenever the element with id 'file' fires a 'change' event.
document.getElementById('file').addEventListener('change', onfilechange);

/**
 * Reads a tag name from the shared `h3d` DataView, starting at the shared
 * `position` and consuming bytes up to and including the closing '>'.
 *
 * After this completes, `position` is just past the closing bracket.
 *
 * @returns {string} The tag name, without the closing bracket.
 * @throws {RangeError} If the data ends before a closing bracket is found.
 */
function readTag() {
    const tag = [];
    // Declared locally: an undeclared variable is an implicit global and
    // throws a ReferenceError in strict mode / ES modules.
    let byte = h3d.getUint8(position++);
    // read until we find the closing bracket
    while (!isCloseBracket(byte)) {
        tag.push(byte);
        byte = h3d.getUint8(position++);
    }
    return String.fromCodePoint(...tag);
}

/**
 * Reads point records from the shared `h3d` DataView, starting at the shared
 * `position`, until an opening bracket '<' is consumed.
 *
 * Each point is a byte accepted by `isChar` followed by three 8-byte
 * little-endian floating point numbers. Bytes that `isChar` rejects are
 * skipped. After this completes, `position` is just past the '<' that
 * ended the scan.
 *
 * @returns {Object<string, number[]>} Map of point name to its three coordinates.
 * @throws {RangeError} If the data ends before an opening bracket is found
 *     or in the middle of a point's coordinates.
 */
function readPoints() {
    const COORDS_PER_POINT = 3;
    const FLOAT64_BYTES = 8;
    const points = {}; // or use a Map
    // Declared locally: an undeclared variable is an implicit global and
    // throws a ReferenceError in strict mode / ES modules.
    let byte = h3d.getUint8(position++);
    // do this until we hit the opening bracket of the closing tag
    while (!isOpenBracket(byte)) {
        if (isChar(byte)) {
            const coords = [];
            for (let i = 0; i < COORDS_PER_POINT; i++) {
                // the numbers are stored little-endian in the file
                coords.push(h3d.getFloat64(position, true));
                position += FLOAT64_BYTES;
            }
            points[String.fromCodePoint(byte)] = coords;
        }
        byte = h3d.getUint8(position++);
    }
    return points;
}

/**
 * Placeholder for parsing the contents of a FACES tag.
 * Currently does nothing and returns undefined.
 */
function readFaces() {
    // don't know what to do here because I don't know what the format of this data is.
}
// these functions check ASCII values - no need to convert them to strings
/**
 * Tells whether a byte value is an ASCII whitespace code:
 * tab (9), line feed (10), vertical tab (11), form feed (12),
 * carriage return (13) or space (32).
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true when the value is one of the codes above.
 */
function isWhitesapce(value) {
    switch (value) {
        case 9:
        case 10:
        case 11:
        case 12:
        case 13:
        case 32:
            return true;
        default:
            return false;
    }
}

/**
 * Tells whether a byte value is the ASCII code of '<' (60).
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for the number 60.
 */
function isOpenBracket(value) {
    const openBracketCode = '<'.charCodeAt(0);
    return value === openBracketCode;
}
/**
 * Tells whether a byte value is the ASCII code of '>' (62).
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for the number 62.
 */
function isCloseBracket(value) {
    const closeBracketCode = '>'.charCodeAt(0);
    return value === closeBracketCode;
}
/**
 * Tells whether a byte value is the ASCII code of '/' (47).
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true only for the number 47.
 */
function isSlash(value) {
    const slashCode = '/'.charCodeAt(0);
    return value === slashCode;
}
/**
 * Tells whether a byte value is the ASCII code of an upper case letter A-Z.
 *
 * @param {number} value - byte value to test.
 * @returns {boolean} true when value is between 65 ('A') and 90 ('Z') inclusive.
 */
function isChar(value) {
    // 'A' is 65 in decimal (0x41 in hex); a lower bound of 41 would also
    // accept punctuation, digits and '@'.
    return value >= 65 && value <= 90;
}

This is just quick and dirty. I would create a separate class that parses this file format.

A few things to note:

When using DataView you have to keep track of the position you are reading from. It doesn't move the pointer forward automatically.
In the readPoints function the littleEndian argument of getFloat64 is set to true because the numbers for the points are stored in little-endian byte order in the file.

That should be enough to figure out how to parse the rest of the file. You just need to know what the data format in each tag is.

How can I read a binary file (xml style) in javascript?

Answers (1)

Related Questions