Reputation: 107
I have a question about opening URLs with urllib2
and cookielib
. It works fine when I define a single URL, but when I build the URL inside a loop it fails with an error saying the URL could not be found.
Here is my code. I am looking for a way to reset the URL, or some other way to solve the problem.
Earlier in the code I fetch a cookie to handle the password, so I think that is what causes the problem. Should I wipe the cache or reset the URL?
import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

parser = MyHTMLParser()

# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) and dat.find("hdf") != -1 and dat.find("xml") == -1:
            print "downloading: ", dat
            JobRequest = urllib2.Request(url + dat)
            JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into the additional HTTP request
            JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'
            # Request the resource at the modified redirect url
            Request = urllib2.Request(JobRedirect_url)
            Response = urllib2.urlopen(Request)
            f = open(dat, 'wb')
            f.write(Response.read())
            f.close()
            Response.close()
        else:
            continue
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))

# The user credentials that will be used to authenticate access to the data
username = ""
password = ""

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    # urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    # urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.
# ===============================================================================
# Open a request to grab filenames within a directory. Print optional
# ===============================================================================
# The FULL url of the directory which contains the files you would like to bulk download
for x in range(1, 31):
    if x < 10:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.0' + str(x) + '/'
    else:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.' + str(x).lstrip("0") + '/'
    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    if x == 1:
        DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    DirBody = urllib2.urlopen(DirRequest).read()
    # DirBody = DirResponse.read(DirResponse)

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser.feed(DirBody)
    Files = parser.dataList

    # Display the contents of the python list declared in the HTMLParser class
    # print Files  # Uncomment to print a list of the files

    # ===============================================================================
    # Call the function to download all files in url
    # ===============================================================================
    BatchJob(Files, cookie_jar)  # Comment out to prevent downloading to your working directory
When the code runs with x=2
, this error occurs:
Traceback (most recent call last):
  File "F:/IST/NSIDC_Parse_HTML_BatchDL.py", line 136, in <module>
    DirBody = urllib2.urlopen(DirRequest).read()
  File "D:\software\python2.7\lib\urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "D:\software\python2.7\lib\urllib2.py", line 435, in open
    response = meth(req, response)
  File "D:\software\python2.7\lib\urllib2.py", line 548, in http_response
    'http', request, response, code, msg, hdrs)
  File "D:\software\python2.7\lib\urllib2.py", line 473, in error
    return self._call_chain(*args)
  File "D:\software\python2.7\lib\urllib2.py", line 407, in _call_chain
    result = func(*args)
  File "D:\software\python2.7\lib\urllib2.py", line 556, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not Found
Upvotes: 0
Views: 290
Reputation: 1764
The first 404 error should be fixed by:
#DirRedirect_url += '&app_type=401'
if x == 1:
    DirRedirect_url += '&app_type=401'
Then another 404, raised in BatchJob
, should be fixed by:
#JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'
JobRedirect_url = urllib2.urlopen(JobRequest).geturl()
It seems that '&app_type=401'
needs to be added only once: after the session cookie is set, the server no longer redirects through Earthdata Login, so geturl() returns the plain directory URL with no query string, and appending '&app_type=401' to it produces an invalid address (hence the 404).
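A more defensive variant (a minimal sketch, not part of the original script; add_auth_param is a hypothetical helper) appends the parameter only when the redirect URL actually carries a query string, instead of keying on the loop counter:
def add_auth_param(redirect_url):
    # Hypothetical helper: only append app_type=401 when the response was
    # actually redirected through Earthdata Login, i.e. the final URL
    # already carries a query string. Otherwise return it unchanged.
    if '?' in redirect_url:
        return redirect_url + '&app_type=401'
    return redirect_url
With this, DirRedirect_url = add_auth_param(DirResponse.geturl()) works on every iteration without the x == 1 special case.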
Full code:
import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

parser = MyHTMLParser()

# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) and dat.find("hdf") != -1 and dat.find("xml") == -1:
            print "downloading: ", dat
            JobRequest = urllib2.Request(url + dat)
            JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into the additional HTTP request
            #JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'
            JobRedirect_url = urllib2.urlopen(JobRequest).geturl()
            # Request the resource at the redirect url
            Request = urllib2.Request(JobRedirect_url)
            Response = urllib2.urlopen(Request)
            f = open(dat, 'wb')
            f.write(Response.read())
            f.close()
            Response.close()
        else:
            continue
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))

# The user credentials that will be used to authenticate access to the data
username = ""  # sorry, the username is not mine, so I couldn't...
password = ""

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    # urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    # urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.
# ===============================================================================
# Open a request to grab filenames within a directory. Print optional
# ===============================================================================
# The FULL url of the directory which contains the files you would like to bulk download
for x in range(1, 3):
    if x < 10:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.0' + str(x) + '/'
    else:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.' + str(x) + '/'
    print(url)
    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    #DirRedirect_url += '&app_type=401'
    if x == 1:
        DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    print(DirRedirect_url)
    DirBody = urllib2.urlopen(DirRequest).read()
    # DirBody = DirResponse.read(DirResponse)

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser.feed(DirBody)
    Files = parser.dataList
    print(Files)

    # Display the contents of the python list declared in the HTMLParser class
    # print Files  # Uncomment to print a list of the files

    # ===============================================================================
    # Call the function to download all files in url
    # ===============================================================================
    BatchJob(Files, cookie_jar)  # Comment out to prevent downloading to your working directory
Upvotes: 1
Reputation: 2888
Found the bug. The solution is:
import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) and dat.find("hdf") != -1 and dat.find("xml") == -1:
            print "downloading: ", url + dat
            JobRequest = urllib2.Request(url + dat)
            JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into the additional HTTP request
            JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'
            # Request the resource at the modified redirect url
            Request = urllib2.Request(JobRedirect_url)
            Response = urllib2.urlopen(Request)
            f = open(dat, 'wb')
            f.write(Response.read())
            f.close()
            Response.close()
        else:
            continue
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))

# The user credentials that will be used to authenticate access to the data
username = ""
password = ""

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    #urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    #urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.
# ===============================================================================
# Open a request to grab filenames within a directory. Print optional
# ===============================================================================
# The FULL url of the directory which contains the files you would like to bulk download
for x in range(1, 31):
    if x < 10:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.0' + str(x) + '/'
    else:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.' + str(x).lstrip("0") + '/'
    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    if x == 1:
        DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    DirBody = urllib2.urlopen(DirRequest).read()
    # DirBody = DirResponse.read(DirResponse)

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser = MyHTMLParser()
    parser.feed(DirBody)
    Files = parser.dataList

    # Display the contents of the python list declared in the HTMLParser class
    print Files  # Print a list of the files

    # ===============================================================================
    # Call the function to download all files in url
    # ===============================================================================
    BatchJob(Files, cookie_jar)
Please notice where I instantiate MyHTMLParser
. If you instantiate it in the original place, all data fed to the parser remains there, meaning the file names for x=1
will still be present in Files
, which leads to the 404
.
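If you would rather keep a single parser object, a minimal sketch of the alternative (assuming the attribute names from the class above) is to clear its state at the top of each loop iteration:
# Reuse one parser across iterations by clearing both the inherited
# parser buffers and the custom accumulators before each feed.
parser.reset()        # clears HTMLParser's internal state
parser.dataList = []  # drop file names collected on the previous pass
parser.Counter = 0    # handle_data only records while Counter == 1
parser.feed(DirBody)
Files = parser.dataList
Re-instantiating the parser, as done above, achieves the same thing and is harder to get wrong.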
Upvotes: 0
Reputation: 2730
You should use simple string formatting to get the day string with a leading zero:
url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.{:02}/'
for x in range(1, 31):
    DirRequest = urllib2.Request(url.format(x))
    DirResponse = urllib2.urlopen(DirRequest)
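For reference, the {:02} format spec zero-pads integers to two digits, so both single- and double-digit days produce valid directory names:
>>> '{:02}'.format(3)
'03'
>>> '{:02}'.format(12)
'12'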
Upvotes: 0