bluerubez

Reputation: 300

pyodbc insert failing silently after hours of working correctly

I have here a scraper. It works for hours, then all of a sudden the inserts stop making it into the table. The program keeps going, but the table remains unchanged... The only errors I see are primary key errors, because some of the rows are duplicates and I do not want to insert them anyway.
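
For reference, one way to make the "silent" case visible is to treat the expected duplicate-key failures separately from everything else. A minimal sketch (the helper name tryInsert is made up; it reuses the insert statement and connection objects from the code below):

    def tryInsert(cursor, cnxn, params):
        # Sketch only: run one parameterized insert and commit it.
        try:
            cursor.execute(insert, *params)
            cnxn.commit()
            return True
        except pyodbc.IntegrityError:
            # Expected case: duplicate primary key, safe to skip for this scraper.
            return False
        except Exception as e:
            # Anything else (dead connection, bad data, ...) should be loud.
            print 'insert failed:', e
            raise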

from bs4 import BeautifulSoup
from datetime import datetime
import mechanize,cookielib,pyodbc,socket,sys
import httplib

url = 'www'
base= 'www'

proxies = {'http': 'proxy'}
username='u'
pw = 'p'

cnxnstring = 'DRIVER={SQL Server};SERVER=s;DATABASE=DB;UID=u;PWD=p'
insert="""INSERT INTO TxProductionPreStaging(LeaseName,LeaseNo,DistrictNo,WellNo,ProdMonth,ProdYear,ProdDate,OilBBL,CasingHeadMCF,GWGasMCF,CondensateBBL,LastScraped)
VALUES(?,?,?,?,?,?,?,?,?,?,?,?)"""

def initReq():

    br = mechanize.Browser()
    br.set_proxies(proxies)
    br.add_proxy_password(username, pw)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    while True:
        try:
            soup = BeautifulSoup(br.open(url,timeout=20).read())
            if soup is not None:
                if soup.head.title.text=='Texas RRC - Railroad Commission of Texas Site Policies':
                    print 'REDIRECT PAGE'
                else:
                    break
        except (mechanize.URLError,mechanize.HTTPError,httplib.IncompleteRead) as exc:
            if isinstance(exc.reason, socket.timeout):
                print exc
        except Exception as error:
            print error

    return br

def initForm( br, prodMonth ):
    br.select_form('SearchCriteriaForm')
    br.form.set_all_readonly(False)
    br.form.find_control(name='viewType').value = ['Lease']
    br["startMonth"]=[prodMonth]
    br["startYear"]=[prodYear]
    br["endMonth"]=[prodMonth]
    br["endYear"]=[prodYear]
    br["district"]=['Statewide']

    r=br.submit(nr=2)

    return r

def bs( r ):

    soup = BeautifulSoup(r.read())

    return soup

def getTags( soup ):

    bigL=[]
    mini=[]
    for node in soup.findAll(attrs={'class': 'DataGrid'}):
        for i in node.findAll('tr'):
            if i.find('td'):
                for j in i.findAll('td'):
                    s = str(j.text);s= s.replace('\r\n',''); s=s.replace(' ','').strip('-').strip('\n')
                    mini.append(s)
                bigL.append(mini[:])
                del mini[:]

    return bigL

def insertTable( bigL, cnxn, cursor ,prodMonth,  prodDate):

        print 'INSERT TABLE'
        global c
        for i,item in enumerate(bigL):
            leaseName=bigL[i][0]
            leaseNo=bigL[i][1]
            districtNo=bigL[i][2]
            wellNo=bigL[i][3]
            oil=int(bigL[i][4].replace(',',''))
            casingHead=int(bigL[i][5].replace(',',''))
            gas=int(bigL[i][6].replace(',',''))
            condensate=int(bigL[i][7].replace(',',''))
            dt = datetime.now()
            try:
                cursor.execute(insert,leaseName,leaseNo,districtNo,wellNo,prodMonth,prodYear,prodDate,oil,casingHead,gas,condensate,dt)
                cnxn.commit()
            except pyodbc.Error as e:
                print e
                cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=u;DATABASE=DB;UID=us;PWD=p');
                cursor = cnxn.cursor()

        return

def baseUrl( prodYear ):

    months=['01','02','03','04','05','06','07','08','09','10','11','12']
    for item in months:
        prodMonth=str(item)
        prodDate=str(prodMonth)+'/01/'+str(prodYear)
        prodDate=datetime.strptime(prodDate, '%m/%d/%Y')
        br = initReq()
        r = initForm( br, prodMonth )
        soup = bs( r )
        L = getTags( soup )
        cnxn, cursor = getcnxn()
        insertTable( L, cnxn, cursor, prodMonth, prodDate )
        count = 20;
        while True:
            cs= str(count)
            count = count +20
            print count,cs

            while True:
                try:
                    soup = BeautifulSoup( br.open(base+cs, timeout=20).read())
                    if soup is not None:
                        if soup.head.title.text=='Texas RRC - Railroad Commission of Texas Site Policies':
                            print 'REDIRECT PAGE'
                        else:
                            break
                except (mechanize.URLError,mechanize.HTTPError, httplib.IncompleteRead) as exc:
                    print exc
                except Exception as e:
                    print e

            var=soup.prettify(formatter=None)

            if 'No Matches Found' in var:
                break
            else:
                L = getTags( soup )
                insertTable( L, cnxn, cursor, prodMonth, prodDate )
    return

def getcnxn():

    while True:
        try:
            cnxn = pyodbc.connect(cnxnstring);
            cursor = cnxn.cursor()
            break
        except:
            print sys.exc_info()[1]
    return cnxn, cursor

if __name__ == '__main__':

    prodYear=str(sys.argv[1]);

    baseUrl( prodYear )

    cnxn.close()

Upvotes: 0

Views: 310

Answers (1)

bluerubez

Reputation: 300

The one thing that helped with this was to try to get a cursor periodically; this tests the connection. I am web scraping, so with every new page I do:

    try:
        cursor = cnxn.cursor()
    except pyodbc.Error:
        cnxn = reinit()
        cursor = cnxn.cursor()
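
reinit() above is the poster's own reconnect helper and is not shown; a minimal version, assuming the same cnxnstring as in the question, might look something like this:

    def reinit():
        # Keep retrying until a fresh connection can be opened,
        # mirroring the getcnxn() loop in the question.
        while True:
            try:
                return pyodbc.connect(cnxnstring)
            except pyodbc.Error as e:
                print 'reconnect failed:', e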

Edit: Also, the error was not being caught properly... this is why I thought it was failing silently. It turns out I just had to catch all errors to see where it was failing.
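
Concretely, the except pyodbc.Error clause around the execute/commit in insertTable appears to have been too narrow to show everything; a broader sketch of that block (same variables as in the question, reconnect logic omitted) would be:

    try:
        cursor.execute(insert, leaseName, leaseNo, districtNo, wellNo,
                       prodMonth, prodYear, prodDate, oil, casingHead,
                       gas, condensate, dt)
        cnxn.commit()
    except pyodbc.Error as e:
        # Database-side failures: duplicate keys, dropped connection, etc.
        print e
    except Exception as e:
        # Everything else that was previously passing through unnoticed.
        print 'unexpected error:', e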

Upvotes: 1
