Reputation: 123
I have it set to "append", but when it pulls the info from the table, it throws the header in for every record. I've read multiple threads on here but haven't found anything that worked. Here are four of the URLs I'm using and the code I have:
http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057804&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors
http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057805&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors
http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057806&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors
http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057807&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors
import csv
from urllib.request import urlopen
import pandas as pd

contents = []
with open('WV_urls.csv', 'r') as csvf:  # Open file in read mode
    urls = csv.reader(csvf)
    for url in urls:
        contents.append(url)  # Add each URL to the list contents

for url in contents:  # Loop over each URL in the list
    page = urlopen(url[0]).read()
    df, header = pd.read_html(page)
    df.to_csv('WV_Licenses_Daily.csv', index=False, header=None, mode='a')
However, if I use just two URLs by themselves, it prints the header and appends the second file:
calls_df, header = pd.read_html('http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057728&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors', header=0)
calls_df1, header = pd.read_html('http://www.wvlabor.com/new_searches/contractor_RESULTS.cfm?wvnumber=WV057729&contractor_name=&dba=&city_name=&County=&Submit3=Search+Contractors', header=0)
calls_df.to_csv('WV_Licenses_Daily.csv', index=False, header=None, mode='w')
calls_df1.to_csv('WV_Licenses_Daily.csv', index=False, header=None, mode='a')
Upvotes: 0
Views: 998
Reputation: 159
The problem is this line, which reads each table without telling pandas that the first row is a header, so that row ends up in the data for every record:

df, header = pd.read_html(page)
Passing header=0 to pd.read_html makes pandas treat the first row of each table as column names instead of data, so it no longer shows up in every record; header=None in to_csv then keeps any header row out of the output:

import csv
from urllib.request import urlopen
import pandas as pd

contents = []
with open('WV_urls.csv', 'r') as csvf:  # Open file in read mode
    urls = csv.reader(csvf)
    for url in urls:
        contents.append(url)  # Add each URL to the list contents

for url in contents:  # Loop over each URL in the list
    page = urlopen(url[0]).read()
    df, header = pd.read_html(page, header=0)  # read_html returns a list of tables; the page has two
    df.to_csv('WV_Licenses_Daily.csv', index=False, header=None, mode='a')
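If you do want the column names once at the top of the CSV, a minimal variation of the loop above (assuming the same WV_urls.csv layout, one URL in the first column of each row) is to write with the header on the first URL only and append without it afterwards:

import csv
from urllib.request import urlopen
import pandas as pd

with open('WV_urls.csv', 'r') as csvf:
    urls = [row for row in csv.reader(csvf)]  # One URL per row, first column

first = True
for url in urls:
    page = urlopen(url[0]).read()
    df, header = pd.read_html(page, header=0)
    # Overwrite with a header row on the first pass, then append without one
    df.to_csv('WV_Licenses_Daily.csv', index=False,
              header=first, mode='w' if first else 'a')
    first = False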
Alternatively, collect every table into a single DataFrame and write the CSV (with its header) once at the end:

import csv
from urllib.request import urlopen
import pandas as pd

contents = []
df = pd.DataFrame(columns=['WV Number', 'Company', 'DBA', 'Address', 'City',
                           'State', 'Zip', 'County', 'Phone', 'Classification*',
                           'Expires'])  # Initialize the DataFrame with the columns

with open('WV_urls.csv', 'r') as csvf:  # Open file in read mode
    urls = csv.reader(csvf)
    for url in urls:
        contents.append(url)  # Add each URL to the list contents

for url in contents:  # Loop over each URL in the list
    page = urlopen(url[0]).read()
    df1, header = pd.read_html(page, header=0)  # Read with the first row as the header
    df = df.append(df1)  # Append this page's rows to the DataFrame

df.to_csv('WV_Licenses_Daily.csv', index=False)
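Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on a current pandas the same idea is spelled with pd.concat. A minimal sketch, assuming the same WV_urls.csv layout:

import csv
from urllib.request import urlopen
import pandas as pd

with open('WV_urls.csv', 'r') as csvf:
    urls = [row for row in csv.reader(csvf)]

frames = []
for url in urls:
    page = urlopen(url[0]).read()
    df1, header = pd.read_html(page, header=0)
    frames.append(df1)  # Collect each page's results table

# Concatenate once and write a single CSV with one header row
pd.concat(frames, ignore_index=True).to_csv('WV_Licenses_Daily.csv', index=False)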
Upvotes: 1