Reputation: 11
This is my first time doing web scraping. I need to extract the dictionaries assigned to two variables inside a script tag in the HTML. This is what I'm doing to fetch the HTML:
# Address of the game page to scrape.
url = "https://www.backstabbr.com/game/Nexus-Season6-Game37/5466300639084544#"
# Fetch the page and keep the response body as text.
page = requests.get(url).text
# Parse the HTML text with the 'html.parser' backend.
soup = BeautifulSoup(page, 'html.parser')
Zooming in on the HTML part I actually need, I have:
<script>
// NEW JAVSCRIPT!;
var stage = "NEEDS_ORDERS";
var orders = {};
var unitsByPlayer = {"Austria": {"Ven": "F"}, "England": {"BAL": "F", "Den": "A", "HEL": "F", "IRI": "F", "Lon": "F", "NAO": "F", "NTH": "F", "Stp": "A", "War": "A"}, "France": {"Ber": "A", "Bre": "F", "Bur": "A", "ENG": "F", "Hol": "F", "Kie": "A", "MAO": "F", "Naf": "A", "Ruh": "A", "Tyr": "A"}, "Italy": {"Mar": "F", "Tus": "A", "TYS": "F"}, "Turkey": {"Ank": "A", "Bud": "A", "Con": "F", "Gal": "A", "ION": "F", "LYO": "F", "Mos": "A", "Ser": "A", "Smy": "F", "Tri": "A", "Ukr": "A"}};
var territories = {"Lon": "England", "Lvp": "England", "Edi": "England", "Tri": "Turkey", "Bud": "Turkey", "Vie": "England", "Con": "Turkey", "Ank": "Turkey", "Smy": "Turkey", "Rom": "Italy", "Nap": "Italy", "Ven": "Austria", "Par": "France", "Mar": "Italy", "Bre": "France", "Sev": "Turkey", "Stp": "England", "Mos": "Turkey", "War": "England", "Ber": "France", "Mun": "France", "Kie": "France", "Den": "England", "Nwy": "England", "Bul": "Turkey", "Tun": "France", "Spa": "France", "Por": "France", "Rum": "Turkey", "Hol": "France", "Swe": "England", "Ser": "Turkey", "Gre": "Turkey", "Bel": "France"};
var activePlayer = null;
var unitChangeCount = {};
var buildableTerritories = [];
var unbuildableTerritories = [];
var retreatOptions = {};
var playerRetreatOrders = {}; // not sure this is used
var disable_engine = true;
var base_url = '/game/Nexus-Season6-Game37/5466300639084544';
var session_id = '';
var want_shaded_territories = none;
var gameType = 'game';
var nextAdjudicationTime = '2021-05-12 20:26:20.762615+00:00';
var gapi_p1 = 'MjAyMS0wNS0xMSAyMDoyNjoyMC43NjMwMDQ=';
</script>
I need to extract the dictionaries assigned to the unitsByPlayer and territories variables. Does anyone know how to do this directly from the soup variable? Thank you very much in advance!
Upvotes: 1
Views: 640
Reputation: 195418
You can use the re and json modules to parse the data:
import json
import re

import requests


def extract_js_object(html, name):
    """Return the JSON value assigned to ``var <name> = ...`` inside ``html``.

    Args:
        html: Page source containing the inline script.
        name: Name of the JavaScript variable to read.

    Returns:
        The decoded value (a dict for object literals).

    Raises:
        ValueError: If no ``var <name> =`` assignment is found, or if the
            assigned value is not valid JSON (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
    """
    match = re.search(r"var\s+" + re.escape(name) + r"\s*=\s*", html)
    if match is None:
        raise ValueError(f"variable {name!r} not found in page")
    # raw_decode parses exactly one JSON value starting at the given index and
    # ignores whatever follows (";", trailing comments), so there is no need
    # to guess where the object ends with a greedy regex.
    value, _ = json.JSONDecoder().raw_decode(html, match.end())
    return value


url = "https://www.backstabbr.com/game/Nexus-Season6-Game37/5466300639084544#"

# Fail fast on HTTP errors instead of trying to parse an error page, and do
# not hang forever if the server never answers.
response = requests.get(url, timeout=10)
response.raise_for_status()
page = response.text

unitsByPlayer = extract_js_object(page, "unitsByPlayer")
territories = extract_js_object(page, "territories")

# pretty print:
print(json.dumps(unitsByPlayer, indent=4))
print()
print(json.dumps(territories, indent=4))
Prints:
{
"Austria": {
"Ven": "F"
},
"England": {
"BAL": "F",
"Den": "A",
"HEL": "F",
"IRI": "F",
"Lon": "F",
"NAO": "F",
"NTH": "F",
"Stp": "A",
"War": "A"
},
"France": {
"Ber": "A",
"Bre": "F",
"Bur": "A",
"ENG": "F",
"Hol": "F",
"Kie": "A",
"MAO": "F",
"Naf": "A",
"Ruh": "A",
"Tyr": "A"
},
"Italy": {
"Mar": "F",
"Tus": "A",
"TYS": "F"
},
"Turkey": {
"Ank": "A",
"Bud": "A",
"Con": "F",
"Gal": "A",
"ION": "F",
"LYO": "F",
"Mos": "A",
"Ser": "A",
"Smy": "F",
"Tri": "A",
"Ukr": "A"
}
}
{
"Lon": "England",
"Lvp": "England",
"Edi": "England",
"Tri": "Turkey",
"Bud": "Turkey",
"Vie": "England",
"Con": "Turkey",
"Ank": "Turkey",
"Smy": "Turkey",
"Rom": "Italy",
"Nap": "Italy",
"Ven": "Austria",
"Par": "France",
"Mar": "Italy",
"Bre": "France",
"Sev": "Turkey",
"Stp": "England",
"Mos": "Turkey",
"War": "England",
"Ber": "France",
"Mun": "France",
"Kie": "France",
"Den": "England",
"Nwy": "England",
"Bul": "Turkey",
"Tun": "France",
"Spa": "France",
"Por": "France",
"Rum": "Turkey",
"Hol": "France",
"Swe": "England",
"Ser": "Turkey",
"Gre": "Turkey",
"Bel": "France"
}
Upvotes: 1