Reputation: 109
I'm goofing off with Cython and a pretty big for loop - over a million iterations. It takes about 40 minutes to run as a regular Python program.
I saved the script as vetdns.pyx and declared cdef variables just below the line that declares the function -
# --- Date stamps -----------------------------------------------------------
# `today` / `yesterday` are 'YYYY-MM-DD' strings; `dayoftheweek` is the
# weekday name taken from the calendar module.
now = datetime.datetime.now()
today = now.strftime("%Y-%m-%d")
my_date = date.today()
weekday_index = my_date.weekday()
dayoftheweek = calendar.day_name[weekday_index]
# Re-parse the formatted string so the result is a datetime at midnight
# (the original author's workaround for date vs. datetime objects).
read_date = datetime.datetime.strptime(today, '%Y-%m-%d')
one_day = datetime.timedelta(days=1)
previous_day = read_date - one_day
yesterday = previous_day.strftime('%Y-%m-%d')

# --- Working directory and file names --------------------------------------
my_dir = os.getcwd()
# extracted = "extracted_"+today
outname = "".join(("alexa_all_vetted", today))
downloaded_file = "".join(("top-1m", today, ".zip"))
INPUT_FILE = "dns-all"
OUTPUT_FILE = "".join(("dns_blacklist_", dayoftheweek))
REMOVE_FILE = "".join(("dns_blacklist_upto", yesterday))

# --- Absolute output locations ---------------------------------------------
PATH = "/home/money/Documents/hybrid"
FULL_FILENAME = os.path.join(PATH, OUTPUT_FILE)
CLEANUP_FILENAME = os.path.join(PATH, REMOVE_FILE)
##cdef outname, INPUT_FILE, OUTPUT_FILE labeled just inside function.
# main(): downloads the Alexa top-1m zip, copies its members into my_dir,
# writes the second CSV column to `outname`, then writes every INPUT_FILE line
# that matches neither Alexa list to FULL_FILENAME.
# NOTE(review): indentation was lost in this paste, so the exact nesting of the
# with/for blocks below cannot be read from the text - confirm against the
# original vetdns.pyx before relying on these comments for loop boundaries.
def main():
zip_file_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
# Fetch the archive to the dated local name built at module level.
urllib.urlretrieve(zip_file_url, downloaded_file)
###naming variables affected in for loop
# This cdef re-declares the three names as locals of main() and never assigns
# them, shadowing the module-level values; it is the line behind the
# "local variable 'outname' referenced before assignment" error quoted below.
cdef outname, INPUT_FILE, OUTPUT_FILE
with zipfile.ZipFile(downloaded_file) as zip_file:
for member in zip_file.namelist():
filename = os.path.basename(member)
# skip directories
if not filename:
continue
# copy file (taken from zipfile's extract)
source = zip_file.open(member)
# `file` is the Python 2 builtin; it does not exist in Python 3.
target = file(os.path.join(my_dir, filename), "wb")
with source, target:
shutil.copyfileobj(source, target)
# NOTE(review): no close() for `whitelist` is visible in this function.
whitelist = open(outname,'w')
# Rebinds `member` from the archive entry name to the opened file object.
# NOTE(review): opens the entry name relative to the current directory, while
# the copy above wrote to my_dir/basename - presumably the same path; verify.
with open(member,'r') as member:
reader = csv.reader(member, delimiter=',')
alexia_hosts = []
for row in reader:
# Keep only the second CSV column of each row.
alexia_hosts.append(row[1])
whitelist.write("\n".join(alexia_hosts))
file_out=open(FULL_FILENAME,"w")
with open(INPUT_FILE, 'r') as dnsMISP:
with open(outname, 'r') as f:
# alexa holds element [4] of each urltools.extract() result;
# alexafull holds the stripped line itself.
alexa=[]
alexafull=[]
blacklist = []
for line in f:
line = line.strip()
alexahostname=urltools.extract(line)
alexa.append(alexahostname[4])
alexafull.append(line)
for line in dnsMISP:
line = line.strip()
hostname = urltools.extract(line)
# print hostname[4]
# NOTE(review): `alexa` and `alexafull` are lists, so each `in` test scans the
# list linearly for every input line; this is likely where the run time goes,
# and building sets once would make each lookup constant-time - profile first.
if hostname[4] in alexa:
print hostname[4]+",this hostname is in alexa"
# The `pass` below is a no-op.
pass
elif hostname[5] in alexafull:
print hostname[5]+",this hostname is in alexafull"
else:
# Matched neither list: keep the line for the blacklist output.
blacklist.append(line)
file_out.write("\n".join(blacklist))
file_out.close()
# Runs the whole pipeline whenever the module is executed or imported.
main()
I then built this setup.py:
from distutils.core import setup
from Cython.Build import cythonize
# Turn vetdns.pyx into extension modules and hand them to setup().
extensions = cythonize("vetdns.pyx")
setup(ext_modules=extensions)
But when I run
python setup.py build_ext --inplace
I get the following errors -
Error compiling Cython file:
------------------------------------------------------------
...
source = zip_file.open(member)
target = file(os.path.join(my_dir, filename), "wb")
with source, target:
shutil.copyfileobj(source, target)
whitelist = open(outname,'w')
^
------------------------------------------------------------
vetdns.pyx:73:25: local variable 'outname' referenced before assignment
This is probably a little beyond me right now but I wanted to play around with it anyway.
Upvotes: 0
Views: 1719
Reputation: 25833
You declare outname
as a local variable on this line:
cdef outname, INPUT_FILE, OUTPUT_FILE
but then you never assign anything to it. Python requires that variables are assigned before you can use them; there is no default value that they get initialized to.
I see that you've got a global variable named "outname", if you want to use the global variable you don't need to use a cdef
inside your function. Same applies to your other global variables.
One thing you can try that's worked well for me in the past is to pop out just the loop into a cythonized function. This way there is less cython code to debug/optimize but when most of the processing time is spent in just a few lines of code (which is often the case), compiling just these lines can make a big difference. In practice, that looks a little like this:
# my_script.py
import os
from my_helper import bottle_neck
def main():
    """Prepare the inputs in plain Python and pass them to the compiled helper.

    Only the hot loop lives in the Cython module; everything else stays here
    so there is less Cython code to debug.
    """
    a = 12
    b = 22
    c = 999
    # More prep code
    # The parenthesised form prints the same tuple under Python 2 and is also
    # valid Python 3.
    print(bottle_neck(a, b, c))


# Run only when executed as a script, so importing this module has no side
# effects.
if __name__ == "__main__":
    main()
And in a different file:
# my_helper.pyx
def bottle_neck(int a, int b, int c):
    """Run the hot loop on three C ints and return them as a tuple.

    Silly example: for some inputs this loop might never finish.
    """
    while a > 0:
        a &= b
        b = a - c
        c = a * b
    return (a, b, c)
Make sure you profile your code; it sucks to assume something is slow, only to discover that it's in fact pretty fast after you've already taken the time to optimize it.
Upvotes: 2