Reputation: 141
The code below collects all the links in an HTML file and writes them to a text file, but it also writes duplicate lines (links). Is there any way to make sure it won't write a link that is already in the file? Is there a built-in method for this, so that I don't have to code the functionality manually?
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print the ``href`` of every ``<a>`` tag and append it to index_link.txt.

    Links are written as they are encountered and nothing is remembered
    between tags, so a link that occurs several times in the document is
    written several times.
    """

    def handle_starttag(self, tag, attrs):
        """Handle one opening tag.

        tag   -- name of the tag.
        attrs -- list of (name, value) pairs for the tag's attributes; the
                 value is None when the attribute has no value (``<a href>``).
        """
        if tag != "a":
            return
        # No explicit emptiness test is needed: looping over an empty list is
        # a no-op.  The previous ``attrs.__len__>0`` compared the bound method
        # itself (not its result) with 0.
        for name, value in attrs:
            # A value-less href has nothing to write and would break the
            # string concatenation below.
            if name != "href" or value is None:
                continue
            # Single-argument print(...) behaves like the print statement.
            print(value)
            # The context manager closes the handle after every write.
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
def main():
    """Feed index.html to MyHTMLParser and copy the page to textfile.html.

    Prints "No file found" and returns without writing anything when
    index.html cannot be opened.
    """
    parser = MyHTMLParser()
    # open() raises when the file is missing, so checking ``f.mode`` afterwards
    # could never detect that case; catch the error from open() instead.
    try:
        f = open("index.html")
    except IOError:
        print("No file found")
        return
    with f:
        contents = f.read()
    parser.feed(contents)
    with open("textfile.html", "w+") as f:
        f.write(contents)


if __name__ == "__main__":
    main()
Upvotes: 2
Views: 118
Reputation: 6822
Use `set()`. Instead of writing each link directly into a file as it is found (which is inefficient anyway), try this:
class MyHTMLParser(HTMLParser):
    """Collect the distinct ``href`` values of all ``<a>`` tags.

    After feeding a document, the links are available in ``my_links``
    (a set, so each link appears once and in no particular order).
    """

    def __init__(self):
        # Call the base initialiser directly on HTMLParser.  Naming the
        # parent class in super() would skip HTMLParser.__init__ itself.
        HTMLParser.__init__(self)
        self.my_links = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag; ignore every other tag.

        tag   -- name of the tag.
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value.
        """
        if tag != "a":
            return None
        # An empty attrs list simply yields no iterations, so no separate
        # length check is required.
        for name, value in attrs:
            if name == "href" and value is not None:
                self.my_links.add(value)
Then retrieve links:
# The class defined above is MyHTMLParser.
parser = MyHTMLParser()
# ... do your parsing here
links = parser.my_links
with open('path/to/file', 'w') as f:
    # A set can be iterated directly; write one link per line so the
    # entries do not run together.
    for link in links:
        f.write(link + '\n')
Upvotes: -1
Reputation: 42748
You need to keep track of the links you have already found yourself, e.g. with a set:
class MyHTMLParser(HTMLParser):
    """Print and append to index_link.txt each distinct ``<a href>`` link.

    Links already seen by this parser instance are remembered in
    ``links_found`` and are not printed or written again.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links_found = set()

    def handle_starttag(self, tag, attrs):
        """Handle one opening tag.

        tag   -- name of the tag.
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        for name, value in attrs:
            # Skip non-href attributes, value-less hrefs (None cannot be
            # concatenated with "\n" below) and links already recorded.
            if name != "href" or value is None or value in self.links_found:
                continue
            self.links_found.add(value)
            # Single-argument print(...) behaves like the print statement.
            print(value)
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
You can also use a simple list, if you want to keep the links, in order, instead of writing them directly to a file:
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Gather the ``href`` of every ``<a>`` tag, once each, in document order.

    The collected values are kept in the ``found_links`` list.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.found_links = []

    def handle_starttag(self, tag, attrs):
        """Append the tag's href to ``found_links`` unless already present."""
        if tag != "a":
            return
        # Turn the (name, value) pairs into a mapping for direct lookup.
        attr_map = dict(attrs)
        if "href" not in attr_map:
            return
        target = attr_map["href"]
        if target in self.found_links:
            return
        self.found_links.append(target)
def main():
    """Parse index.html, then write its links and a copy of the page.

    The links collected by the parser go to index_link.txt, one per line;
    the unmodified page text goes to textfile.html.
    """
    link_parser = MyHTMLParser()

    with open("index.html") as page:
        contents = page.read()
    link_parser.feed(contents)

    with open("index_link.txt","w") as link_file:
        joined = '\n'.join(link_parser.found_links)
        link_file.write(joined + '\n')

    with open("textfile.html","w") as page_copy:
        page_copy.write(contents)


if __name__ == "__main__":
    main()
Upvotes: 2
Reputation: 2334
It's simple: just use a list to hold the links that have already been written. Here I am using the `html_links` attribute for that:
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print and append to index_link.txt each distinct ``<a href>`` link.

    Links already handled by this parser instance are kept, in order of
    first appearance, in the ``html_links`` list and are not written again.
    """

    def __init__(self):
        # The base initialiser must actually be called so that the parser's
        # internal state exists before feed() is used.
        HTMLParser.__init__(self)
        self.html_links = []

    def handle_starttag(self, tag, attrs):
        """Handle one opening tag.

        tag   -- name of the tag.
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value (``<a href>``).
        """
        if tag != "a":
            return
        # Looping over an empty attrs list is a no-op, so no length check is
        # needed (``attrs.__len__>0`` compared the method itself with 0).
        for name, value in attrs:
            # Skip non-href attributes, value-less hrefs (None cannot be
            # concatenated with "\n" below) and links already recorded.
            if name != "href" or value is None or value in self.html_links:
                continue
            # Single-argument print(...) behaves like the print statement.
            print(value)
            self.html_links.append(value)
            # The context manager closes the handle after every write.
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")
def main():
    """Feed index.html to MyHTMLParser and copy the page to textfile.html.

    Prints "No file found" and returns without writing anything when
    index.html cannot be opened.
    """
    parser = MyHTMLParser()
    # open() raises when the file is missing, so checking ``f.mode`` afterwards
    # could never detect that case; catch the error from open() instead.
    try:
        f = open("index.html")
    except IOError:
        print("No file found")
        return
    with f:
        contents = f.read()
    parser.feed(contents)
    with open("textfile.html", "w+") as f:
        f.write(contents)


if __name__ == "__main__":
    main()
Upvotes: 1