ametefe
ametefe

Reputation: 113

How can I create a Python Dictionary with Selenium Web Scraping?

Someone suggested that I use a Python dictionary so that I can easily extract data from it.

My program scrapes data from MyAnimeList and stores it in a variable. I would like to convert this variable into a dictionary. I think it is possible, but I don't know how to do it.

The variable looks something like this:

['Synonyms: Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest', 'Japanese: ポケットモンスター', 'Type: TV', 'Episodes: 276', 'Status: Finished Airing', 'Aired: Apr 1, 1997 to Nov 14, 2002', 'Premiered: Spring 1997', 'Broadcast: Thursdays at 19:00 (JST)', 'Producers: TV Tokyo, TV Tokyo Music, Studio Jack', 'Licensors: VIZ Media, 4Kids Entertainment', 'Studios: OLM', 'Source: Game', 'Genres: Action, Adventure, Comedy, Kids, Fantasy', 'Duration: 24 min. per ep.', 'Rating: PG - Children', 'Score: 7.341 (scored by 291,570 users)', 'Ranked: #21572', 'Popularity: #287', 'Members: 504,076', 'Favorites: 4,076', '']

I would like to automatically convert it into something like this:

# Desired result: one entry per "Label: value" string of the scraped list.
# The label (text before the first ": ") is the key and the rest of the string,
# kept whole, is the value; the list's trailing empty string has no entry here.
information_dict = {
  "Synonyms": "Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest",
  "Japanese": "ポケットモンスター",
  "Type": "TV",
  "Episodes": "276",
  "Status": "Finished Airing",
  "Aired": "Apr 1, 1997 to Nov 14, 2002",
  "Premiered": "Spring 1997",
  "Broadcast": "Thursdays at 19:00 (JST)",
  "Producers": "TV Tokyo, TV Tokyo Music, Studio Jack",
  "Licensors": "VIZ Media, 4Kids Entertainment",
  "Studios": "OLM",
  "Source": "Game",
  "Genres": "Action, Adventure, Comedy, Kids, Fantasy",
  "Duration": "24 min. per ep.",
  "Rating": "PG - Children",
  "Score": "7.341 (scored by 291,570 users)",
  "Ranked": "#21572",
  "Popularity": "#287",
  "Members": "504,076",
  "Favorites": "4,076"
}

This is what my code looks like:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Ask the user for the title to look up; it is typed into the site's search box below.
Anime = input("Enter Anime:")
# Start Chrome through the chromedriver binary at this local path.
driver = webdriver.Chrome(executable_path=r"C:\Users\amete\Documents\chromedriver.exe")

# Open a MyAnimeList search page. The URL carries a hard-coded query ("one piece");
# the search field is cleared and refilled with the user's input further down.
driver.get("https://myanimelist.net/search/all?q=one%20piece&cat=all")

# Locate the search input, then wait (up to 20 seconds) until it is clickable.
search = driver.find_element_by_xpath('//input[@name="q"]')
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@name="q"]')))
#  Clear the field: send Ctrl+A to select its text, then Delete to remove it
search.send_keys(Keys.CONTROL, 'a')
search.send_keys(Keys.DELETE)

#  The field is now cleared and the program can type whatever it wants
search.send_keys(Anime)
search.send_keys(Keys.RETURN)

#  Accept the cookies
#  (waits for the third button of the "qc-cmp2-ui" element to be clickable, then clicks it)
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[3]'))).click()

#  Added this wait
#  Wait until the first link of the first article in the "anime" section is clickable.
wait.until(EC.element_to_be_clickable((By.XPATH,
                                       '//h2[@id="anime"]//ancestor::div[@class="content-left"]//article[1]/div[contains(@class, "list")][1]/div[contains(@class, "information")]/a[1]')))
# Click that same link to open the title's page.
# NOTE(review): `link` receives the return value of click(), not the element itself
# (presumably None - confirm); nothing below reads `link`.
link = driver.find_element_by_xpath(
    '//h2[@id="anime"]//ancestor::div[@class="content-left"]//article[1]/div[contains(@class, "list")][1]/div[contains(@class, "information")]/a[1]').click()

# -----Extracting information-------#

# Extracting the Jap Title and the English Title
# The header text is split into one list entry per line.
Titles = driver.find_element_by_xpath('//*[@id="contentWrapper"]/div[1]').text
Titles = Titles.split("\n")
print(Titles)
# Drop the "Edit" entry only when it is present: list.remove() raises
# ValueError for a missing value, which would abort the whole script.
if "Edit" in Titles:
    Titles.remove("Edit")
print(Titles)

# Score and episode count are read from fixed positions in the page layout.
Score = driver.find_element_by_xpath(
    '//*[@id="content"]/table/tbody/tr/td[2]/div[1]/table/tbody/tr[1]/td/div[1]/div[1]/div[1]/div[1]/div[1]/div').text
Episodes = driver.find_element_by_xpath('//*[@id="content"]/table/tbody/tr/td[1]/div/div[10]').text

print("The Score of the Anime is:" + str(Score))
print(Episodes)
# -------Other Information about the show--------#
# Collect the text of the sidebar entries div[7] .. div[27], in page order.
Information_List = []
for i in range(7, 28):
    try:
        Info = driver.find_element_by_xpath('//*[@id="content"]/table/tbody/tr/td[1]/div/div[' + str(i) + ']').text
    except Exception:
        # Best effort, as before: stop at the first entry that cannot be read
        # and keep what was collected. Catching Exception instead of using a
        # bare `except` lets Ctrl+C / SystemExit still end the script.
        break
    Information_List.append(Info)
print(Information_List)

# --------Extracting the data and putting it into variables--------#

# ------Genres-------#
# Index of the (last) entry that mentions "Genres". It stays None when no entry
# does, instead of leaving the name unassigned and failing with NameError below.
Genre_Index = None
for x, s in enumerate(Information_List):
    if "Genres" in s:
        Genre_Index = x

if Genre_Index is None:
    # No genres entry was scraped: fall back to an empty list.
    Genre = []
else:
    print(Information_List[Genre_Index])
    Genre = Information_List[Genre_Index].replace("Genres: ", "")
    # Trim the whitespace around each name rather than deleting every space,
    # which would also fuse the words of a multi-word genre name.
    Genre = [name.strip() for name in Genre.split(",")]
print(Genre)

#-------Rating-------#
# Find the rating by its label: position 14 only holds the rating when every
# earlier entry was scraped, and indexing raises IndexError on a shorter list.
Rating = next((s for s in Information_List if s.startswith("Rating:")), None)
if Rating is not None:
    print(Rating)

Upvotes: 1

Views: 1207

Answers (2)

AlexMoshi
AlexMoshi

Reputation: 339

# Scraped "Label: value" strings; the list ends with an empty string.
source = ['Synonyms: Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest', 'Japanese: ポケットモンスター', 'Type: TV', 'Episodes: 276', 'Status: Finished Airing', 'Aired: Apr 1, 1997 to Nov 14, 2002', 'Premiered: Spring 1997', 'Broadcast: Thursdays at 19:00 (JST)', 'Producers: TV Tokyo, TV Tokyo Music, Studio Jack', 'Licensors: VIZ Media, 4Kids Entertainment', 'Studios: OLM', 'Source: Game', 'Genres: Action, Adventure, Comedy, Kids, Fantasy', 'Duration: 24 min. per ep.', 'Rating: PG - Children', 'Score: 7.341 (scored by 291,570 users)', 'Ranked: #21572', 'Popularity: #287', 'Members: 504,076', 'Favorites: 4,076', '']

information_dict = {}


def add(_dict, key, value):
    """Store *value* under *key* in *_dict*, replacing any existing entry."""
    _dict[key] = value


for item in source:
    # Split at the FIRST ':' only. Splitting on every ':' and keeping the
    # second piece cuts a value that itself contains a colon:
    # 'Thursdays at 19:00 (JST)' would come out as ' Thursdays at 19'.
    key, separator, value = item.partition(':')
    if not separator:
        # Entry without a "label: value" pair (the trailing empty string).
        print('End')
        continue
    # strip() removes the space that follows the colon in every entry.
    add(information_dict, key.strip(), value.strip())
print(information_dict)

Output:

End
 {'Aired': ' Apr 1, 1997 to Nov 14, 2002',
 'Broadcast': ' Thursdays at 19',
 'Duration': ' 24 min. per ep.',
 'Episodes': ' 276',
 'Favorites': ' 4,076',
 'Genres': ' Action, Adventure, Comedy, Kids, Fantasy',
 'Japanese': ' ポケットモンスター',
 'Licensors': ' VIZ Media, 4Kids Entertainment',
 'Members': ' 504,076',
 'Popularity': ' #287',
 'Premiered': ' Spring 1997',
 'Producers': ' TV Tokyo, TV Tokyo Music, Studio Jack',
 'Ranked': ' #21572',
 'Rating': ' PG - Children',
 'Score': ' 7.341 (scored by 291,570 users)',
 'Source': ' Game',
 'Status': ' Finished Airing',
 'Studios': ' OLM',
 'Synonyms': ' Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest',
 'Type': ' TV'}

Upvotes: 1

Patrik
Patrik

Reputation: 499

Something like this could help you:

# Scraped "Label: value" strings; the list ends with an empty string.
informations = ['Synonyms: Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest', 'Japanese: ポケットモンスター', 'Type: TV', 'Episodes: 276', 'Status: Finished Airing', 'Aired: Apr 1, 1997 to Nov 14, 2002', 'Premiered: Spring 1997', 'Broadcast: Thursdays at 19:00 (JST)', 'Producers: TV Tokyo, TV Tokyo Music, Studio Jack', 'Licensors: VIZ Media, 4Kids Entertainment', 'Studios: OLM', 'Source: Game', 'Genres: Action, Adventure, Comedy, Kids, Fantasy', 'Duration: 24 min. per ep.', 'Rating: PG - Children', 'Score: 7.341 (scored by 291,570 users)', 'Ranked: #21572', 'Popularity: #287', 'Members: 504,076', 'Favorites: 4,076', '']

information_dict = {}
for item in informations:
    # partition() splits at the FIRST ':' only, so a value that contains a
    # colon itself ('Thursdays at 19:00 (JST)') is kept whole; split(':')[1]
    # would return just 'Thursdays at 19'.
    key, separator, value = item.partition(':')
    if separator:
        # Entries without any ':' (the trailing empty string) are skipped.
        information_dict[key.strip()] = value.strip()

print(information_dict)

output:

{
    "Synonyms": "Pocket Monsters, Indigo League, Adventures on the Orange Islands, The Johto Journeys, Johto League Champions, Master Quest",
    "Japanese": "ポケットモンスター",
    "Type": "TV",
    "Episodes": "276",
    "Status": "Finished Airing",
    "Aired": "Apr 1, 1997 to Nov 14, 2002",
    "Premiered": "Spring 1997",
    "Broadcast": "Thursdays at 19",
    "Producers": "TV Tokyo, TV Tokyo Music, Studio Jack",
    "Licensors": "VIZ Media, 4Kids Entertainment",
    "Studios": "OLM",
    "Source": "Game",
    "Genres": "Action, Adventure, Comedy, Kids, Fantasy",
    "Duration": "24 min. per ep.",
    "Rating": "PG - Children",
    "Score": "7.341 (scored by 291,570 users)",
    "Ranked": "#21572",
    "Popularity": "#287",
    "Members": "504,076",
    "Favorites": "4,076"
}

Upvotes: 1

Related Questions