icodekamal.com
icodekamal.com

Reputation: 141

How to remove duplicate lines and write the result to a file in Python

The code below extracts all the links from an HTML file and writes them to a text file, but it also writes duplicate lines (links). Is there any way to make sure it won't write links that are already in the file? Is there a built-in method, so that I don't have to code this functionality manually?

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that appends the href of every <a> tag to index_link.txt.

    Each link is written as soon as it is encountered, so a link that occurs
    more than once in the document is also written more than once.
    """

    def handle_starttag(self,tag,attrs):
        """Print and record the href value of an <a> start tag.

        tag is the tag name and attrs is the list of (name, value) pairs
        that HTMLParser passes to this callback.
        """
        if tag!="a":
            return
        # An empty attrs list simply makes the loop a no-op.  The previous
        # ``attrs.__len__>0`` test compared the bound method itself with 0
        # instead of calling it, so it never filtered anything.
        for name,value in attrs:
            # A value-less attribute (``<a href>``) is reported as None and
            # cannot be concatenated with "\n", so it is skipped.
            if name=="href" and value is not None:
                print(value)
                # ``with`` closes the handle after each write instead of
                # leaking one open file per link.
                with open("index_link.txt","a+") as f:
                    f.write(value+"\n")

def main():
    """Feed index.html to MyHTMLParser and copy its contents to textfile.html.

    Prints "No file found" and returns without writing anything when
    index.html cannot be opened.
    """
    parser=MyHTMLParser()
    # open() raises IOError for a missing file instead of returning a handle
    # in some other mode, so the former ``f.mode=="r"`` test could never reach
    # its else branch (and ``contents`` would have been undefined if it had).
    try:
        f=open("index.html")
    except IOError:
        print ("No file found")
        return
    with f:
        contents=f.read()
    parser.feed(contents)
    # ``with`` guarantees the copy is flushed and closed.
    with open("textfile.html","w+") as f:
        f.write(contents)

if __name__=="__main__":
    main()

Upvotes: 2

Views: 118

Answers (3)

mirosval
mirosval

Reputation: 6822

Use set(). Instead of writing your links directly into a file (which is inefficient anyway), try this:

class MyHTMLParser(HTMLParser):
    """HTML parser that collects the distinct href values of <a> tags.

    The links are gathered in the ``my_links`` set, so each link is kept
    only once no matter how often it occurs in the document.
    """

    def __init__(self):
        # Call the base initializer directly: HTMLParser is an old-style
        # class on Python 2, where super() does not work.
        HTMLParser.__init__(self)
        self.my_links = set()

    def handle_starttag(self,tag,attrs):
        """Add the href value of an <a> start tag to ``my_links``."""
        # ``not attrs`` replaces ``attrs.__len__ == 0``, which compared the
        # bound method with 0 and was therefore never true.
        if tag != "a" or not attrs:
            return None

        for name, value in attrs:
            # A value-less ``href`` is reported as None; it is not a link.
            if name == "href" and value is not None:
                self.my_links.add(value)

Then retrieve links:

# The class defined for this purpose is MyHTMLParser.
parser = MyHTMLParser()
# ... do your parsing here
links = parser.my_links
with open('path/to/file', 'w') as f:
    # A set can be iterated directly; no list() copy is needed.
    for link in links:
        # Terminate every link with a newline so each ends up on its own line.
        f.write(link + "\n")

Upvotes: -1

Daniel
Daniel

Reputation: 42748

You need to record found links yourself, e.g. with a set:

class MyHTMLParser(HTMLParser):
    """HTML parser that writes each distinct <a> href to index_link.txt once."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Links already written by this parser instance.
        self.links_found = set()

    def handle_starttag(self,tag,attrs):
        """Print and append the href of an <a> tag unless it was seen before."""
        if tag!="a":
            return
        for name,value in attrs:
            # Skip other attributes, links that were already recorded, and a
            # value-less ``href`` (reported as None), which could not be
            # concatenated with "\n".
            if name!="href" or value is None or value in self.links_found:
                continue
            self.links_found.add(value)
            print(value)
            with open("index_link.txt","a+") as f:
                f.write(value+"\n")

You can also use a simple list if you want to keep the links in order, instead of writing them directly to a file:

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that collects distinct <a> href values in document order."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Links in order of first appearance, without duplicates.
        self.found_links = []

    def handle_starttag(self,tag,attrs):
        """Append the href of an <a> tag to ``found_links`` if it is new."""
        if tag=="a":
            # .get() yields None both when href is absent and when it has no
            # value (``<a href>``); storing None would later break the
            # '\n'.join() over the collected links.
            href = dict(attrs).get("href")
            if href is not None and href not in self.found_links:
                self.found_links.append(href)

def main():
    """Extract the links of index.html and save them, plus a copy of the page.

    The unique links collected by MyHTMLParser are written one per line to
    index_link.txt, and the unmodified page text is written to textfile.html.
    """
    link_parser = MyHTMLParser()
    with open("index.html") as page:
        html = page.read()
    link_parser.feed(html)

    with open("index_link.txt","w") as link_file:
        # One link per line, with a trailing newline after the last one.
        link_lines = '\n'.join(link_parser.found_links) + '\n'
        link_file.write(link_lines)

    with open("textfile.html","w") as page_copy:
        page_copy.write(html)

if __name__ == "__main__":
    main()

Upvotes: 2

Prashant Puri
Prashant Puri

Reputation: 2334

It's simple: just use the list data type to hold the links found so far. Here I am using the html_links variable:

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that writes each distinct <a> href to index_link.txt once.

    The links already written are remembered, in order, in ``html_links``.
    """

    def __init__(self):
        # ``super(self).__init__`` neither built a usable super object nor
        # called the initializer, leaving the base parser uninitialized.
        # Calling HTMLParser.__init__ directly sets it up properly.
        HTMLParser.__init__(self)
        self.html_links = []

    def handle_starttag(self,tag,attrs):
        """Print and append the href of an <a> tag unless it was seen before."""
        if tag!="a":
            return
        # No length check is needed: an empty attrs list makes the loop a
        # no-op (the old ``attrs.__len__>0`` never called the method anyway).
        for name,value in attrs:
            # A value-less ``href`` is reported as None and cannot be
            # concatenated with "\n", so it is skipped.
            if name=="href" and value is not None and value not in self.html_links:
                print(value)
                self.html_links.append(value)
                # ``with`` closes the handle after each write instead of
                # leaking one open file per link.
                with open("index_link.txt","a+") as f:
                    f.write(value+"\n")

def main():
    """Feed index.html to MyHTMLParser and copy its contents to textfile.html.

    Prints "No file found" and returns without writing anything when
    index.html cannot be opened.
    """
    parser=MyHTMLParser()
    # open() raises IOError for a missing file instead of returning a handle
    # in some other mode, so the former ``f.mode=="r"`` test could never reach
    # its else branch (and ``contents`` would have been undefined if it had).
    try:
        f=open("index.html")
    except IOError:
        print ("No file found")
        return
    with f:
        contents=f.read()
    parser.feed(contents)
    # ``with`` guarantees the copy is flushed and closed.
    with open("textfile.html","w+") as f:
        f.write(contents)

if __name__=="__main__":
    main()

Upvotes: 1

Related Questions