Sirwan Afifi
Sirwan Afifi

Reputation: 10824

How to recursively construct a JSON hierarchy from a NodeList?

Given the following input:

<dl>
  <dt>
    <h3>Title A</h3>
    <dl>
      <dt>
        <h3>Title A- A</h3>
        <dl>
          <dt><a href="#">Item</a></dt>
          <dt><a href="#">Item</a></dt>
        </dl>
      </dt>
      <dt><a href="#">Item</a></dt>
      <dt><a href="#">Item</a></dt>
      <dt><a href="#">Item</a></dt>
      <dt><a href="#">Item</a></dt>
      <dt>
        <h3>Title B- A</h3>
        <dl>
          <dt><a href="#">Item</a></dt>
          <dt><a href="#">Item</a></dt>
        </dl>
      </dt>
      <dt><a href="#">Item</a></dt>
    </dl>
  </dt>
</dl>

I want to build a JSON object based on the above input:

{
  "title": "Title A",
  "children": [
    {
      "title": "Title A- A",
      "children": [
        {"title": "Item"},
        {"title": "Item"}
      ]
    },
    {"title": "Item"},
    {"title": "Item"},
    {"title": "Item"},
    {"title": "Item"},
    {
      "title": "Title B- A",
      "children": [
        {"title": "Item"},
        {"title": "Item"}
      ]
    },
    {"title": "Item"}
  ]
}

Here's what I have tried so far:

// Builds a { title, children } object for the given element.
// Returns an empty array (not an object) when `node` is falsy.
function buildTree(node) {
    if (!node) return [];
    // querySelector matches descendants at any depth, not only direct
    // children, so this picks the first <h3> (or else <a>) anywhere below `node`.
    const h3 = node.querySelector('h3') || node.querySelector('a');
    let result = {
        title: h3.innerText,
        children: []
    };
    // Collects every <dl> nested anywhere below `node`, not just the immediate one.
    const array = [...node.querySelectorAll('dl')];
    // A spread array is always truthy (even when empty), so this branch always runs.
    if (array) {
        // Recurses into only the first <dt> found inside each collected <dl>.
        result.children = array.map(el => buildTree(el.querySelector('dt')));
    }
    return result;
}

The result I'm getting is different from what I expect. Here's what I get:

{
  "title": "Title A",
  "children": [
    {
      "title": "Title A",
      "children": [
        {
          "title": "Title A- A",
          "children": [
            {
              "title": "Item A- A 1",
              "children": []
            }
          ]
        },
        {
          "title": "Item A- A 1",
          "children": []
        },
        {
          "title": "Title B- A 1",
          "children": []
        }
      ]
    },
    {
      "title": "Title A- A",
      "children": [
        {
          "title": "Item A- A 1",
          "children": []
        }
      ]
    },
    {
      "title": "Item A- A 1",
      "children": []
    },
    {
      "title": "Title B- A 1",
      "children": []
    }
  ]
}

It seems that some of the data is missing. Any idea what I might be overlooking?

Upvotes: 3

Views: 466

Answers (4)

Mulan
Mulan

Reputation: 135227

fix html

First I would remark that you are misusing dl. From the MDN docs -

The HTML <dl> element represents a description list. The element encloses a list of groups of terms (specified using the <dt> element) and descriptions (provided by <dd> elements) ...

Here's what the correct use of dl, dt, and dd would look like -

<dl>
  <dt>Title 1</dt>
  <dd>  
    <dl>
      <dt>Title 1.1</dt>
      <dd><a href="#">Item 1.1.1</a></dd>
      <dd><a href="#">Item 1.1.2</a></dd>
    </dl>
  </dd>
  <dd><a href="#">Item 1.2</a></dd>
  <dd><a href="#">Item 1.3</a></dd>
  <dd><a href="#">Item 1.4</a></dd>
  <dd><a href="#">Item 1.5</a></dd>
  <dd>
    <dl>
      <dt>Title 1.6</dt>    
      <dd><a href="#">Item 1.6.1</a></dd>
      <dd><a href="#">Item 1.6.2</a></dd>
    </dl>
  </dd>
  <dd><a href="#">Item 1.7</a></dd>
</dl>

Notice it matches the expected shape of your output -

{
  "title": "Title 1",
  "children": [
    {
      "title": "Title 1.1",
      "children": [
        {"title": "Item 1.1.1"},
        {"title": "Item 1.1.2"}
      ]
    },
    {"title": "Item 1.2"},
    {"title": "Item 1.3"},
    {"title": "Item 1.4"},
    {"title": "Item 1.5"},
    {
      "title": "Title 1.6",
      "children": [
        {"title": "Item 1.6.1"},
        {"title": "Item 1.6.2"}
      ]
    },
    {"title": "Item 1.7"}
  ]
}

fromHtml

If you are not willing (or able) to change the input html as described above, please see Scott's wonderful answer. To write a program for the proposed html, I would break it into two parts. First we write fromHtml with a simple recursive form -

/**
 * Converts a description-list DOM subtree into nested arrays of strings.
 *
 * - DT and A elements yield their text content.
 * - DL elements yield the flattened (one level) results of their child nodes.
 * - DD elements yield the same flattened results wrapped in a one-element array,
 *   so each DD stays a distinct group inside its parent DL.
 * - Anything else (including null/undefined and nodes without a tagName)
 *   yields an empty array, which disappears when the parent flattens.
 *
 * @param {*} e - node to convert; may be null or undefined
 * @returns {string|Array} text for DT/A, otherwise a (possibly nested) array
 */
function fromHtml (e) {
  const tag = e?.tagName

  if (tag === "DT" || tag === "A") {
    return e.textContent
  }

  if (tag !== "DL" && tag !== "DD") {
    return []
  }

  const flattened = Array.from(e.childNodes, (child) => fromHtml(child)).flat()
  return tag === "DD" ? [ flattened ] : flattened
}

// Convert the first <dl> in the document (querySelector returns the first match).
fromHtml(document.querySelector('dl'))

Which gives us this intermediate format -

[
  "Title 1",
  [
    "Title 1.1",
    [ "Item 1.1.1" ],
    [ "Item 1.1.2" ]
  ],
  [ "Item 1.2" ],
  [ "Item 1.3" ],
  [ "Item 1.4" ],
  [ "Item 1.5" ],
  [
    "Title 1.6",
    [ "Item 1.6.1" ],
    [ "Item 1.6.2" ]
  ],
  [ "Item 1.7" ]
]

applyLabels

Following that, I would write a separate applyLabels function which adds the title and children labels you require -

/**
 * Labels the intermediate array format as { title, children } objects.
 *
 * The first element of `node` is the title; every remaining element is a
 * child in the same array format. Nodes without children get only a `title`
 * property (no `children` key at all).
 *
 * @param {Array} node - [ title, ...childArrays ]
 * @returns {{title: *, children?: Array}} labelled tree
 */
const applyLabels = (node) => {
  const [ title, ...rest ] = node
  if (rest.length === 0) {
    return { title }
  }
  return { title, children: rest.map((child) => applyLabels(child)) }
}
  
// Two-step pipeline: DOM -> nested arrays (fromHtml) -> labelled objects (applyLabels).
const result =
  applyLabels(fromHtml(document.querySelector('dl')))
{
  "title": "Title 1",
  "children": [
    {
      "title": "Title 1.1",
      "children": [
        {"title": "Item 1.1.1"},
        {"title": "Item 1.1.2"}
      ]
    },
    {"title": "Item 1.2"},
    {"title": "Item 1.3"},
    {"title": "Item 1.4"},
    {"title": "Item 1.5"},
    {
      "title": "Title 1.6",
      "children": [
        {"title": "Item 1.6.1"},
        {"title": "Item 1.6.2"}
      ]
    },
    {"title": "Item 1.7"}
  ]
}

I might suggest one final change, which guarantees all nodes in the output have a uniform shape, { title, children }. It's a change worth noting because in this case applyLabels is easier to write and it behaves better -

/**
 * Labels the intermediate array format as uniform { title, children } objects.
 *
 * The first element of `node` is the title; every remaining element is a
 * child in the same array format. Every node, including leaves, carries a
 * `children` array (empty for leaves).
 *
 * @param {Array} node - [ title, ...childArrays ]
 * @returns {{title: *, children: Array}} labelled tree
 */
const applyLabels = (node) => {
  const [ title, ...descendants ] = node
  const children = descendants.map((child) => applyLabels(child))
  return { title, children }
}

Yes, this means that deepest descendants will have an empty children: [] property, but it makes consuming the data much easier as we don't have to null-check certain properties.


demo

Expand the snippet below to verify the results of fromHtml and applyLabels in your own browser -

/**
 * Converts a description-list DOM subtree into nested arrays of strings.
 *
 * - DT and A elements yield their text content.
 * - DL elements yield the flattened (one level) results of their child nodes.
 * - DD elements yield the same flattened results wrapped in a one-element array,
 *   so each DD stays a distinct group inside its parent DL.
 * - Anything else (including null/undefined and nodes without a tagName)
 *   yields an empty array, which disappears when the parent flattens.
 *
 * @param {*} e - node to convert; may be null or undefined
 * @returns {string|Array} text for DT/A, otherwise a (possibly nested) array
 */
function fromHtml (e) {
  const tag = e?.tagName

  if (tag === "DT" || tag === "A") {
    return e.textContent
  }

  if (tag !== "DL" && tag !== "DD") {
    return []
  }

  const flattened = Array.from(e.childNodes, (child) => fromHtml(child)).flat()
  return tag === "DD" ? [ flattened ] : flattened
}

/**
 * Labels the intermediate array format as { title, children } objects.
 *
 * The first element of `node` is the title; the remaining elements are
 * children in the same format. Leaves get only a `title` property.
 *
 * @param {Array} node - [ title, ...childArrays ]
 * @returns {{title: *, children?: Array}} labelled tree
 */
const applyLabels = (node) => {
  const [ title, ...rest ] = node
  if (rest.length === 0) {
    return { title }
  }
  return { title, children: rest.map((child) => applyLabels(child)) }
}
  
// Run the full pipeline on the first <dl> in the page and print the labelled tree.
const result =
  applyLabels(fromHtml(document.querySelector('dl')))
  
console.log(result)
<dl>
  <dt>Title 1</dt>
  <dd>  
    <dl>
      <dt>Title 1.1</dt>
      <dd><a href="#">Item 1.1.1</a></dd>
      <dd><a href="#">Item 1.1.2</a></dd>
    </dl>
  </dd>
  <dd><a href="#">Item 1.2</a></dd>
  <dd><a href="#">Item 1.3</a></dd>
  <dd><a href="#">Item 1.4</a></dd>
  <dd><a href="#">Item 1.5</a></dd>
  <dd>
    <dl>
      <dt>Title 1.6</dt>    
      <dd><a href="#">Item 1.6.1</a></dd>
      <dd><a href="#">Item 1.6.2</a></dd>
    </dl>
  </dd>
  <dd><a href="#">Item 1.7</a></dd>
</dl>


remarks

I've written hundreds of answers on the topic of recursion and data transformation and yet this is the first time I think I've used .flat in an essential way. I thought I had a use case in this Q&A but Scott's comment took it from me! This answer differs because domNode.childNodes is not a true array and so Array.prototype.flatMap cannot be used. Thanks for the interesting problem.

Upvotes: 2

Scott Sauyet
Scott Sauyet

Reputation: 50797

This is a clear-cut case for mutual recursion. This is straightforward to process if we distinguish how to handle a DL and how to handle a DT. (As others have pointed out, without any DD's this is an odd structure.)

I added an id to the initial DL to make it easy to get hold of. But however you choose to grab this root element, you should be able to just pass it to handleDl to get back your structure.

// Converts a <dl> element into an array of node objects, one per direct
// <dt> child. Children with any other nodeName are ignored.
const handleDl = (dl) =>
  [...dl.children]
    .filter (({nodeName}) => nodeName === 'DT')
    .map ((dt) => handleDt (dt))

// Converts a single <dt> element into a node object.
//  - With a direct <h3> child: {title, children}, where the title is the
//    heading text and the children come from the first direct <dl> child.
//  - Without an <h3>: a leaf {title} built from the <dt>'s own text content.
const handleDt = (dt) => {
  const kids = [...dt.children]
  const h3 = kids .find (({nodeName}) => nodeName === 'H3')
  const dl = kids .find (({nodeName}) => nodeName === 'DL')
  if (!h3) {
    return {title: dt.textContent}
  }
  // A heading with no nested <dl> is an empty branch; without this guard
  // handleDl would be called with undefined and throw.
  return {title: h3.textContent, children: dl ? handleDl (dl) : []}
}

// The root <dl> in the demo markup carries id="foo" so it can be looked up directly.
const node = document.getElementById('foo')

console .log (handleDl (node))
.as-console-wrapper {max-height: 70% !important; top: 30%}
<dl id = "foo"><dt><h3>Title A</h3><dl><dt><h3>Title A- A</h3><dl><dt><a href="#">Item</a></dt><dt><a href="#">Item</a></dt></dl></dt><dt><a href="#">Item</a></dt><dt><a href="#">Item</a></dt><dt><a href="#">Item</a></dt><dt><a href="#">Item</a></dt><dt><h3>Title B- A</h3><dl><dt><a href="#">Item</a></dt><dt><a href="#">Item</a></dt></dl></dt><dt><a href="#">Item</a></dt></dl></dt></dl>

handleDl simply maps handleDt over all DT children of the node supplied.

handleDt is slightly more complicated, because there are two different styles. We find the first H3 and the first DL among the node's children. If an H3 was found, we choose our title from that, and process the DL into a children node, using handleDl. If no H3 was found, we simply report the title based on the current node's text content. You might have to get more sophisticated in deriving this text, assuming this is a simplification of your actual problem. But it shouldn't be difficult.

Update

Using the much more logical structure from @Thankyou, we can write this same style much more simply. (Note that there are many error paths that aren't checked. This could do with a dose of ?-nullish operators... an exercise for the reader.) This has a similar breakdown as above, but with simpler code. It is fairly dependent upon that specific structure, where every DD has either a single DL or a simple HTML wrapper around our title, and every DL has one DT followed by one or more DDs. But since that is the specified structure of DL, this isn't much of a hardship.

// Converts a <dl> into {title, children}. Assumes the first element child
// holds the title (the <dt>) and every following child is a <dd>.
const handleDl = (dl) => ({
  title: dl .children [0] .textContent,
  children: [...dl .children] .slice (1) .map ((dd) => handleDd (dd))
})

// Converts a <dd> into a node: a nested tree when its first element child is
// a <dl>, otherwise a leaf titled with the <dd>'s text content.
// The optional chain covers a <dd> with no element children at all
// (plain text), which would otherwise throw on `.nodeName`.
const handleDd = (dd) =>
  dd .children [0]?.nodeName === "DL"
    ? handleDl (dd .children [0])
    : {title: dd .textContent}

// Start from the first <dl> in the document (querySelector returns the first match).
const node = document .querySelector ('dl')

console .log (handleDl (node))
.as-console-wrapper {max-height: 70% !important; top: 30%}
<dl><dt>Title 1</dt><dd><dl><dt>Title 1.1</dt><dd><a href="#">Item 1.1.1</a></dd><dd><a href="#">Item 1.1.2</a></dd></dl></dd><dd><a href="#">Item 1.2</a></dd><dd><a href="#">Item 1.3</a></dd><dd><a href="#">Item 1.4</a></dd><dd><a href="#">Item 1.5</a></dd><dd><dl><dt>Title 1.6</dt><dd><a href="#">Item 1.6.1</a></dd><dd><a href="#">Item 1.6.2</a></dd></dl></dd><dd><a href="#">Item 1.7</a></dd></dl>

Upvotes: 2

emi
emi

Reputation: 3070

You'd better make the decision before recursion:

/**
 * Builds a { title, children } tree from nested DL/DT markup by dispatching
 * on the nodeName of each direct element child of `node`.
 *
 * - H3 / A: supplies the title and guarantees a `children` array exists.
 * - DL:     its DT entries become this node's children.
 * - DT:     converted recursively and appended to `children`.
 * - other:  reported with console.warn and skipped.
 *
 * When `node` is itself a DL, the result has only a `children` array
 * (no title), which is how the DL case obtains the nested entries.
 *
 * @param {Element} node - a DT (or DL) element to convert
 * @returns {{title?: string, children?: Array}} the converted node
 */
function buildTree(node) {
  const result = {};
  for (const el of node.children) {
    switch (el.nodeName) {
      case 'H3':
      case 'A':
        result.title = el.innerText;
        // Do not discard children already collected from an earlier DL/DT.
        if (!result.children) {
          result.children = [];
        }
        break;
      case 'DL':
        // Recursing into a DL yields { children: [...] }; take the array
        // itself so `children` is always an array, never an object.
        result.children = buildTree(el).children || [];
        break;
      case 'DT':
        // When iterating a DL no title has been seen yet, so `children`
        // must be created here before pushing.
        if (!result.children) {
          result.children = [];
        }
        result.children.push(buildTree(el));
        break;
      default:
        console.warn(`Unknown node type '${el.nodeName}'`, el);
    }
  }
  return result;
}

With this example, I can see that you are trying to parse DTs and DLs in almost the same way.

Upvotes: 2

likle
likle

Reputation: 1797

I think one important part is that querySelector and querySelectorAll are recursive. And could it be that you confused dl and dt, since there are multiple dt's in a dl? Would the following work for you?

/**
 * Builds a { title, children } object for a <dt> element.
 *
 * The title comes from the element's direct <h3> child or, failing that,
 * its direct <a> child. Children are built recursively from the <dt>
 * elements that sit directly inside a direct <dl> child; the `:scope >`
 * selectors restrict matching to that immediate structure.
 *
 * @param {Element} node - the <dt> element to convert
 * @returns {{title: string, children: Array}|Array} the converted node, or
 *   an empty array when `node` is falsy
 */
function buildTree(node) {
    if (!node) {
        return [];
    }

    const heading = node.querySelector(':scope > h3');
    const label = heading || node.querySelector(':scope > a');
    const title = label.innerText;

    const children = [];
    for (const item of node.querySelectorAll(':scope > dl > dt')) {
        children.push(buildTree(item));
    }

    return { title, children };
}

Note that you must initially pass a dt node for this to work. (https://jsfiddle.net/52ups6fL/)

Upvotes: 1

Related Questions