SecureEntrepeneur
SecureEntrepeneur

Reputation: 97

Python BeautifulSoup - Trying to format data into JSON

This is the code I have so far; the commented-out section is what dumps all of the data into a JSON file:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

# Download the page. The timeout keeps the script from hanging on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the real document.
response = requests.get(
    'https://ofsistorage.blob.core.windows.net/publishlive/ConList.html',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Every <ol> under <body>; only consumed by the disabled dump loop below.
name_list = soup.find('body')
name_list_items = name_list.find_all('ol')
data = []

# For each <li> that carries a "Name 6:" label, collect the text node that
# follows each of its first six <b> tags.
# ':-soup-contains' and 'string=' are the current spellings of the deprecated
# ':contains' selector and 'text=' argument.
all_names = [
    [label.find_next_sibling(string=True).strip() for label in li.select('b')[:6]]
    for li in soup.select('li:has(b:-soup-contains("Name 6:"))')
]

# Disabled: dumps the raw text of every <li> into `data`.
# for name in name_list_items:
#     list_items = name.find_all('li')
#     list_items = [item.text for item in list_items]
#     data.append(list_items)

# NOTE: with the loop above disabled, `data` is still empty here, so the file
# receives "[]"; `all_names` is collected but not written anywhere.
with open('data.json', 'w') as f:
    json.dump(data, f)

Upvotes: 1

Views: 961

Answers (1)

Andrej Kesely
Andrej Kesely

Reputation: 195428

This script extracts the information about each person from the page and prints it as a JSON-formatted string:

import re
import json
import requests
from bs4 import BeautifulSoup


url = 'https://ofsistorage.blob.core.windows.net/publishlive/ConList.html'

# Whole-word match so that "male" embedded in a longer word (for example a
# name such as "JAMALEDDIN") is not reported as a gender.
GENDER_RE = re.compile(r'\b((?:fe)?male)\b', flags=re.I)


def _text_after(label):
    """Return the cleaned text node that follows *label*, or '-' if unavailable.

    *label* is the ``<b>`` tag found for a field, or ``None`` when the entry
    has no such label. Surrounding whitespace and non-breaking spaces are
    removed from the returned text.
    """
    if label is None:
        return '-'
    node = label.find_next_sibling(string=True)
    if node is None:
        return '-'
    return node.strip().replace('\xa0', '')


# The timeout prevents an indefinite hang, and raise_for_status() keeps an
# HTTP error page from being parsed as if it were the list itself.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

persons = []
# ':-soup-contains' and 'string=' are the current spellings of the deprecated
# ':contains' selector and 'text=' argument.
for li in soup.select('li:has(b:-soup-contains("Name 6:"))'):
    # The text after each of the first six <b> tags is one name part. Parts
    # containing '/' are dropped (presumably "n/a"-style placeholders).
    name = []
    for label in li.select('b')[:6]:
        node = label.find_next_sibling(string=True)
        if node is None:
            continue
        part = node.strip()
        if '/' not in part:
            name.append(part)

    # The first remaining part is used as the last name and the final
    # remaining part as the first name.
    if len(name) > 1:
        last, *_, first = name
    elif name:
        last, first = '-', name[0]
    else:
        # Every part was filtered out; avoid indexing an empty list.
        last = first = '-'

    dob = _text_after(li.select_one('b:-soup-contains("DOB:")'))
    pob = _text_after(li.select_one('b:-soup-contains("POB:")'))
    nationality = _text_after(li.select_one('b:-soup-contains("Nationality:")'))
    other = _text_after(li.select_one('b:-soup-contains("Other Information:")'))

    # Gender has no dedicated label lookup here, so it is searched for in the
    # entry's full text.
    gender_match = GENDER_RE.search(li.get_text(strip=True, separator=' '))
    gender = gender_match.group(1) if gender_match else '-'

    persons.append({
        'firstname': first,
        'lastname': last,
        'about': {
            'date_of_birth': dob,
            'place_of_birth': pob,
            'nationality': nationality,
            'gender': gender,
        },
        'other': other,
    })

print(json.dumps(persons, indent=4))

Prints:

[
    {
        "firstname": "ABDUL AZIZ",
        "lastname": "ABBASIN",
        "about": {
            "date_of_birth": "--/--/1969.",
            "place_of_birth": "Sheykhan village, Pirkowti Area, Orgun District, Paktika Province, Afghanistan",
            "nationality": "-",
            "gender": "-"
        },
        "other": "UN Ref TAi.155. Key commander in the Haqqani Network (TAe.012) under Sirajuddin Jallaloudine Haqqani (TAi.144). Taliban Shadow Governor of Orgun District, Paktika Province, as of early 2010. Operated a training camp for non-Afghan fighters in Paktika Province. Has been involved in the transport of weapons to Afghanistan."
    },
    {
        "firstname": "AZIZIRAHMAN",
        "lastname": "ABDUL AHAD",
        "about": {
            "date_of_birth": "--/--/1972.",
            "place_of_birth": "Shega District, Kandahar Province, Afghanistan",
            "nationality": "Afghan",
            "gender": "-"
        },
        "other": "UN Ref TAi.121. Belongs to Hotak tribe."
    },
    {
        "firstname": "BARADAR",
        "lastname": "ABDUL AHMAD TURK",
        "about": {
            "date_of_birth": "--/--/1968.",
            "place_of_birth": "Yatimak village, Dehrawood District, Uruzgan Province, Afghanistan",
            "nationality": "Afghan",
            "gender": "-"
        },
        "other": "UN Ref TAi.024. Arrested in Feb 2010 and in custody in Pakistan. Extradition request to Afghanistan pending in Lahore High Court, Pakistan as of June 2011. Belongs to Popalzai tribe. Senior Taliban military commander and member of Taliban Quetta Council as of May 2007. DOB is approximate."
    },

... and so on.

Upvotes: 1

Related Questions