Reputation: 188
Is there a simple way to extract all text on a webpage that is within quotation marks? Simply parsing the HTML code as a string doesn't seem to do the trick.
Upvotes: 1
Views: 388
Reputation: 123
Replace the Yahoo link with any link you want. This will produce a list (stored in `final`) of all sentences and words that appear between double quotes.
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib
import re
def tag_visible(element):
    """Return True when a parsed text node should count as visible text.

    A node is rejected when its parent tag is one of the non-content
    containers listed below, or when the node itself is an HTML comment.

    ``element`` is expected to expose ``parent.name`` (as the text nodes
    yielded by BeautifulSoup do).
    """
    # Parent tags whose text is treated as non-visible by this filter.
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in hidden_parents:
        return False
    # Comments are text nodes as well, but are not part of the page text.
    return not isinstance(element, Comment)
def text_from_html(body):
    """Return the visible text of an HTML document as a single string.

    ``body`` is parsed with BeautifulSoup's ``'html.parser'``. Every text
    node accepted by ``tag_visible`` is stripped of surrounding whitespace,
    and the resulting pieces are joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select every text node in the document.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))
# urllib.urlopen only exists on Python 2; Python 3 moved it to
# urllib.request, so resolve whichever one this interpreter provides.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from contextlib import closing

# Download the raw page. closing() guarantees the connection is released
# on both Python 2 (no context-manager support) and Python 3.
with closing(urlopen('https://news.yahoo.com/poll-biden-leads-trump-four-165851808.html')) as response:
    html = response.read()

alltext = text_from_html(html)

# Map typographic (curly) quotes to their ASCII equivalents so a single
# pattern matches quotations regardless of how the page typeset them.
charmap = {
    0x201c: u'"',
    0x201d: u'"',
    0x2018: u"'",
    0x2019: u"'",
}
alltext = alltext.translate(charmap)

# Collect everything found between each pair of straight double quotes.
final = re.findall(r'"([^"]*)"', alltext)
Upvotes: 1