Reputation: 3152
I have a bunch of <p> elements in an HTML page and am using BeautifulSoup to parse it. The page is an index of an online book. What I need to do is create a nested JSON structure where there is currently none, since some terms of the index are children of a single parent term. You can think of the index like this:
parent term
    child term
    child term
    child term
parent term
parent term
However, the HTML is not nested; every term sits in a flat sequence of <p> tags, as shown below. As you can see, the term Action(s) is a parent term with 8 children, and the next parent term, Actionable insights, has 0 children. I have a loop that iterates through each <p> tag and need to nest the children under the parent in the JSON file. So I can't use find_next_siblings() (plural), because it would just grab all following <p> tags indiscriminately. But if I could use find_next_sibling() (singular) to collect only the tags with 'class': 'index2' into a list, then I could add that list as a child of the parent term. At least, that is my logic so far.
<h2>A</h2>
<p class="index1">Acceptance of insights, merit-based, <a href="c01.xhtml#Page_3">3</a></p>
<p class="index1">Accuracy of data, <a href="c05.xhtml#Page_125">125</a>, <a href="c05.xhtml#Page_126">126</a></p>
<p class="index1">Action(s):</p>
<p class="index2">of audience, “so what?” question about, <a href="c05.xhtml#Page_133">133</a>–135</p>
<p class="index2">communicating to turn insights into, <a href="c01.xhtml#Page_10">10</a>–12</p>
<p class="index2">in deriving value from analytics, <a href="c01.xhtml#Page_11">11</a>–12</p>
<p class="index2">driving, <i>see</i> Driving action</p>
<p class="index2">empowering audience to act, <a href="c06.xhtml#Page_178">178</a>–180</p>
<p class="index2">in 4D Framework, <a href="c05.xhtml#Page_128">128</a>–132</p>
<p class="index2">inspired by insights, <a href="c01.xhtml#Page_9">9</a>–10</p>
<p class="index2">as objective of communication, <a href="c02.xhtml#Page_36">36</a>, <a href="c02.xhtml#Page_37">37</a></p>
<p class="index1">Actionable insights, <a href="c03.xhtml#Page_51">51</a>, <a href="c05.xhtml#Page_132">132</a>–135</p>
<p class="index1">Additive annotations, <a href="c08.xhtml#Page_244">244</a></p>
<p class="index1">Aggregating data, <a href="c08.xhtml#Page_232">232</a></p>
<p class="index1">AGT/HEED, <a href="c04.xhtml#Page_108">108</a>–109</p>
<p class="index1">Aha Moment:</p>
<p class="index2">connecting Hook and, <a href="c06.xhtml#Page_176">176</a></p>
<p class="index2">in Data Storytelling Arc, <a href="c06.xhtml#Page_163">163</a>–167</p>
<p class="index2">in data trailers, <a href="c06.xhtml#Page_181">181</a>, <a href="c06.xhtml#Page_182">182</a>, <a href="c09.xhtml#Page_292">292</a>–293</p>
<p class="index2">identified in storyboarding, <a href="c06.xhtml#Page_172">172</a>–173</p>
<p class="index2">initial interest generated by, <a href="c06.xhtml#Page_178">178</a></p>
<p class="index2">in manufacturing gross margin story, <a href="c09.xhtml#Page_295">295</a></p>
<p class="index2">in Rosling story, <a href="c09.xhtml#Page_273">273</a></p>
<p class="index2">in US education system story, <a href="c09.xhtml#Page_286">286</a></p>
The problem, however, is that I can't figure out the logic for it. It's complicated because I also need recursion. I keep getting NoneType errors (noted below). The rest of the code works if I take out the codeblock I'm stuck on. How can I use BeautifulSoup to get only the next <p> tag with a class of index2? At least the children are consistently identified as index2. I just want to avoid scanning the entire document every time I need a few child terms. It seems like it should be straightforward, but I have not had luck. Thanks for your help.
MY CODE:
from bs4 import BeautifulSoup
import json

# convert html to bs4 object
def bs4_convert(file):
    with open(file, encoding='utf8') as fp:
        html = BeautifulSoup(fp, 'html.parser')
    return html

# create a tag
def p_parser(el, link_prefix):
    tags = []
    for p in el:
        tag = {
            'tag': p.text,
            'definition': '',
            'source': [{'title': link.text, 'href': link_prefix + link['href']} for link in p.find_all('a', recursive=False)]
        }
        # add all child terms of a parent term to a list
        # STUCK HERE: this codeblock keeps getting snagged on a NoneType error,
        # saying p.find_next_sibling('p')['class'] is not subscriptable,
        # even though I check for None.
        children = []
        if p.find_next_sibling('p') is not None:
            while p.find_next_sibling('p')['class'] == ['index2']:
                next_child = p.find_next_sibling('p')
                if next_child is not None:
                    children.append(next_child)
                    p = next_child
                else:
                    break
        # make child tags
        tag['children'] = p_parser(children, link_prefix)
        tags.append(tag)
    return tags

# loop through all indices
def html_parser(html, link_prefix):
    tags = []
    # extract index
    html.find('section', {'role': "doc-index"})
    # iterate over every indented letter in index
    letters = html.find_all('section')
    for letter in letters:
        tags += p_parser(letter.find_all('p'), link_prefix)
    return tags

# add the course name as parent to all tags
def add_course_tag(course_name, tags):
    complete_tags = {
        'tag': course_name,
        'definition': '',
        'source': tags
    }
    return complete_tags

# write tags to JSON file
def write_to_json(course_name, tags):
    # Serializing json
    json_object = json.dumps(tags, indent=4)
    # Writing to course_name.json
    with open(course_name + '_tags.json', 'w') as outfile:
        outfile.write(json_object)

def main():
    # course information for the book
    course = {
        'course': 'data_storytelling',  # exact course name
        'file': 'data_storytelling.html',  # the html file you extracted
        'parse_type': 'index'
    }
    # this link prefix should be the same for all pages of one book
    prefix_id = 'effective-data-storytelling/9781119615712'
    link_prefix = 'https://learning.oreilly.com/library/view/' + prefix_id + '/'
    tags = []
    # parse the html
    html = bs4_convert(course['file'])
    # create tags
    tags = html_parser(html, link_prefix)
    # add course name as outermost tag
    tags = add_course_tag(course['course'], tags)
    # write results to json file
    write_to_json(course['course'], tags)

if __name__ == "__main__":
    main()
EDIT: I tried this code, but it would not stop running in the command line (and nothing new printed to the JSON file).
# create a tag
def p_parser(el, link_prefix):
    tags = []
    for p in el:
        tag = {
            'tag': p.text,
            'definition': '',
            'source': [{'title': link.text, 'href': link_prefix + link['href']} for link in p.find_all('a', recursive=False)]
        }
        # add all child terms of a parent term to a list
        children = []
        for child in p.next_siblings:
            if child.name == 'p' and 'index2' not in child['class']:
                break
            elif child.name == 'p' and 'index2' in child['class']:
                children.append(child)
        tags.append(tag)
        # make child tags
        tag['children'] = p_parser(children, link_prefix)
    return tags
Upvotes: 0
Views: 349
Reputation: 25196
You are close to your goal; just a little adjustment is needed. While iterating, check the tag's name as well as its class, and break if it is not a <p> with a class containing index2:
children = []
for c in p.next_siblings:
    if c.name == 'p' and 'index2' not in c['class']:
        break
    elif c.name == 'p' and 'index2' in c['class']:
        children.append(c)
Just to demonstrate; I believe you can adapt it to your code.
import bs4

html = '''
<h2>A</h2>
<p class="index1">Acceptance of insights, merit-based, <a href="c01.xhtml#Page_3">3</a></p>
<p class="index1">Accuracy of data, <a href="c05.xhtml#Page_125">125</a>, <a href="c05.xhtml#Page_126">126</a></p>
<p class="index1">Action(s):</p>
<p class="index2">of audience, “so what?” question about, <a href="c05.xhtml#Page_133">133</a>–135</p>
<p class="index2">communicating to turn insights into, <a href="c01.xhtml#Page_10">10</a>–12</p>
<p class="index2">in deriving value from analytics, <a href="c01.xhtml#Page_11">11</a>–12</p>
<p class="index2">driving, <i>see</i> Driving action</p>
<p class="index2">empowering audience to act, <a href="c06.xhtml#Page_178">178</a>–180</p>
<p class="index2">in 4D Framework, <a href="c05.xhtml#Page_128">128</a>–132</p>
<p class="index2">inspired by insights, <a href="c01.xhtml#Page_9">9</a>–10</p>
<p class="index2">as objective of communication, <a href="c02.xhtml#Page_36">36</a>, <a href="c02.xhtml#Page_37">37</a></p>
<p class="index1">Actionable insights, <a href="c03.xhtml#Page_51">51</a>, <a href="c05.xhtml#Page_132">132</a>–135</p>
<p class="index1">Additive annotations, <a href="c08.xhtml#Page_244">244</a></p>
<p class="index1">Aggregating data, <a href="c08.xhtml#Page_232">232</a></p>
<p class="index1">AGT/HEED, <a href="c04.xhtml#Page_108">108</a>–109</p>
<p class="index1">Aha Moment:</p>
<p class="index2">connecting Hook and, <a href="c06.xhtml#Page_176">176</a></p>
<p class="index2">in Data Storytelling Arc, <a href="c06.xhtml#Page_163">163</a>–167</p>
<p class="index2">in data trailers, <a href="c06.xhtml#Page_181">181</a>, <a href="c06.xhtml#Page_182">182</a>, <a href="c09.xhtml#Page_292">292</a>–293</p>
<p class="index2">identified in storyboarding, <a href="c06.xhtml#Page_172">172</a>–173</p>
<p class="index2">initial interest generated by, <a href="c06.xhtml#Page_178">178</a></p>
<p class="index2">in manufacturing gross margin story, <a href="c09.xhtml#Page_295">295</a></p>
<p class="index2">in Rosling story, <a href="c09.xhtml#Page_273">273</a></p>
<p class="index2">in US education system story, <a href="c09.xhtml#Page_286">286</a></p>
'''
soup = bs4.BeautifulSoup(html, 'html.parser')

# this link prefix should be the same for all pages of one book
prefix_id = 'effective-data-storytelling/9781119615712'
link_prefix = 'https://learning.oreilly.com/library/view/' + prefix_id + '/'

data = []
for p in soup.select('p.index1'):
    tag = {
        'tag': p.text,
        'definition': '',
        'source': [{'title': link.text, 'href': link_prefix + link['href']} for link in p.find_all('a', recursive=False)],
        'children': []
    }
    for c in p.next_siblings:
        if c.name == 'p' and 'index1' in c['class']:
            break
        elif c.name == 'p' and 'index2' in c['class']:
            tag['children'].append({
                'tag': c.text,
                'definition': '',
                'source': [{'title': link.text, 'href': link_prefix + link['href']} for link in c.find_all('a', recursive=False)],
            })
    data.append(tag)

data
# create tag
def create_tag(p, link_prefix):
    tag = {
        'tag': p.text,
        'definition': '',
        'source': [{'title': link.text, 'href': link_prefix + link['href']} for link in p.find_all('a', recursive=False)]
    }
    return tag

# parse p and p children
def p_parser(el, link_prefix):
    tags = []
    for p in el:
        tag = create_tag(p, link_prefix)
        # add all child terms of a parent term to a list
        children = []
        for child in p.next_siblings:
            if child.name == 'p' and 'index2' not in child['class']:
                break
            elif child.name == 'p' and 'index2' in child['class']:
                children.append(create_tag(child, link_prefix))
        # make child tags
        if children:
            tag['children'] = children
        # add any parent tags to tags
        tags.append(tag)
    return tags
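Condensed to a self-contained run, the sibling-walk approach produces the nested structure directly and without recursion. The sample HTML below is trimmed from the question's index; the dictionary keys are simplified to just 'tag' and 'children' for brevity.

```python
from bs4 import BeautifulSoup

html = '''
<p class="index1">Action(s):</p>
<p class="index2">driving action</p>
<p class="index2">empowering audience</p>
<p class="index1">Actionable insights</p>
'''

soup = BeautifulSoup(html, 'html.parser')

def p_parser(paragraphs):
    tags = []
    for p in paragraphs:
        tag = {'tag': p.text, 'children': []}
        # collect index2 siblings until the next non-index2 <p>
        for sib in p.next_siblings:
            if sib.name != 'p':
                continue  # skip whitespace text nodes between tags
            if 'index2' not in sib['class']:
                break
            tag['children'].append({'tag': sib.text})
        tags.append(tag)
    return tags

result = p_parser(soup.select('p.index1'))
# Action(s): gets 2 children; Actionable insights gets none
```

Because each parent only walks forward until it hits the next index1 entry, the whole document is never rescanned, which was the original concern.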
Upvotes: 1