Reputation: 4763
I want to write a Python program using Beautiful Soup that hyperlinks words in HTML, using a CSV file with `anchor_text` and `hyperlink` columns.
The CSV file has 2 columns:
anchor_text | hyperlink |
---|---|
Google | https://www.google.com |
Bing | https://bing.com |
Yahoo | https://yahoo.com |
Active Campaign | https://activecampaign.com |
Here is sample HTML
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign Text</p>
<!-- /wp:paragraph -->
I want the output to be
<!-- wp:paragraph -->
<p>This is a existing link <a href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another <a href="https://www.google.com/">Google</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase <a href="https://bing.com/">bing</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word <a href="https://activecampaign.com/">Active Campaign</a> Text</p>
<!-- /wp:paragraph -->
Any help is appreciated
Upvotes: 0
Views: 126
Reputation: 4710
You should try with anchor/links on the outer loop and then break down the matching strings in the inner loop:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4 import element as bs4_element
import csv
# Sample HTML to process: four paragraphs, each wrapped in a pair of
# `wp:paragraph` comment markers. The first paragraph already contains an
# <a> element; the last one ends with "Active Campaign." so the anchor text
# is immediately followed by punctuation.
html_doc = """
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph -->
"""
# Parse the sample into a tree using the 'html.parser' backend; the rest of
# the script modifies this `soup` object in place.
soup = BeautifulSoup(html_doc, 'html.parser')
# Read the CSV file that maps anchor text (column 1) to hyperlink (column 2).
# newline='' is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are parsed correctly.
with open('file.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    # dict(reader) raises ValueError on blank lines or on rows that do not
    # have exactly two cells, so build the mapping explicitly: use the first
    # two cells of each row and ignore rows that are too short. As before, a
    # later row with the same anchor text overrides an earlier one.
    hyperlinks = {row[0]: row[1] for row in reader if len(row) >= 2}

# Plain text nodes in the parsed tree are instances of this class
# (bs4_element comes from `from bs4 import element as bs4_element`).
be_navStr = bs4_element.NavigableString

# List of (anchor_text, hyperlink) pairs with surrounding whitespace removed.
hList = [
    (anchor_text, hyperlink)
    for anchor_text, hyperlink in (
        (key.strip(), value.strip()) for key, value in hyperlinks.items()
    )
    if anchor_text and hyperlink  # no blanks
    # Skip the column-header row, in case the file starts with one;
    # otherwise the literal word "anchor_text" would be linked to "hyperlink".
    and (anchor_text.lower(), hyperlink.lower()) != ('anchor_text', 'hyperlink')
]
# Dump the document as parsed, before any links are inserted, framed by a
# 75-character banner.
old_side_rule = '#' * 35
print(old_side_rule, 'OLD', old_side_rule, '\n')
print(soup, '\n')
print('#' * 75, '\n\n\n')
# For every (anchor text, hyperlink) pair, wrap each occurrence of the anchor
# text found in the document's text nodes in an <a href=...> element.
for txt, link in hList:
    # Case-insensitive match of the anchor text as a whole word/phrase: it
    # must not be directly preceded or followed by a word character. This
    # keeps "bing" from being linked inside "bingo" while still matching
    # "Active Campaign." or "bing?" (trailing punctuation stays outside the
    # link). re.escape() makes characters such as "." or "+" in the anchor
    # text match literally.
    pattern = re.compile(
        r'(?<!\w)' + re.escape(txt) + r'(?!\w)', re.IGNORECASE
    )

    # Take a snapshot of the matching text nodes first, because the tree is
    # modified below. The exact type check (rather than isinstance) is
    # deliberate: Beautiful Soup's Comment and similar node classes are
    # subclasses of NavigableString and must not be rewritten.
    navStrs = [
        d for d in soup.descendants
        if type(d) is be_navStr and pattern.search(d)
    ]

    for ns in navStrs:
        # Never create a link inside an existing link, no matter how deeply
        # the text is nested inside the <a> element.
        enclosing_link = ns.find_parent('a')
        if enclosing_link is not None:
            # The text node is the direct content of an <a> and is exactly
            # the anchor text: point the existing link at the CSV URL.
            if ns.parent is enclosing_link and str(ns).lower() == txt.lower():
                enclosing_link['href'] = link  # comment out if you don't want to replace/update the link
            continue

        # Rebuild the text node as: text, <a>, text, <a>, ..., tail.
        # Everything is sliced from the original string using the regex
        # match positions, so the original casing and spacing are preserved.
        text = str(ns)
        pos = 0
        for match in pattern.finditer(text):
            if match.start() > pos:  # avoid inserting empty text nodes
                ns.insert_before(text[pos:match.start()])
            hlTag = soup.new_tag('a', href=link)
            hlTag.append(match.group(0))
            ns.insert_before(hlTag)
            pos = match.end()

        # Whatever follows the last match replaces the original node.
        tail = text[pos:]
        if tail:
            ns.replace_with(tail)
        else:
            ns.extract()
# Dump the document again after processing, framed by the same style of
# 75-character banner.
new_side_rule = '#' * 35
print(new_side_rule, 'NEW', new_side_rule, '\n')
print(soup, '\n')
print('#' * 75)
printed output:
################################### OLD ###################################
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph -->
###########################################################################
################################### NEW ###################################
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another <a href="https://www.google.com">Google</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase <a href="https://bing.com">bing</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word <a href="https://activecampaign.com">Active Campaign</a>.</p>
<!-- /wp:paragraph -->
###########################################################################
This should work even with multiple matches in the same string as long as they don't overlap (like "Google Chrome" and "Chrome Beta")
Upvotes: 1