Reputation: 17
I am using zipline-reloaded with the 'quandl' bundle. I am trying to construct a simple 1-year momentum factor that is demeaned by sector, but it raises an indexing error, as explained below. Bear with me for the explanation, as zipline code is lengthy.
First, I generate prices dataset using the below snippet
# Global configuration shared by the snippets below.
# Name of the ingested zipline data bundle to load.
bundle_name = 'quandl'
# Exchange calendar; passed to the DataPortal and to get_pricing below.
trading_calendar = get_calendar('NYSE')
# Last date of the universe / pricing window.
universe_end_date = pd.Timestamp('2016-01-05')
years=5 #number of years before universe_end_date at which the pricing window starts
field = 'close' #which OHLCV field get_pricing retrieves by default
domain = US_EQUITIES
# Screen: top 500 assets ranked by AverageDollarVolume over a 120-bar window.
universe = AverageDollarVolume(window_length=120).top(500)
# Pipeline with no columns; it is only used to evaluate the screen.
pipeline = Pipeline(screen=universe, domain=domain)
bundle_data = bundles.load(bundle_name)
def choose_loader(column):
    """Return the pipeline loader responsible for ``column``.

    Only columns of ``USEquityPricing`` are supported; they are all served by
    the module-level ``pricing_loader``.

    Raises:
        Exception: if ``column`` is not one of ``USEquityPricing.columns``.
    """
    if column in USEquityPricing.columns:
        return pricing_loader
    raise Exception('Column not in USEquityPricing')
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field=field):
    """Fetch a daily history window of ``field`` for ``assets``.

    Both boundary dates are truncated to midnight and moved forward by one
    ``pd.offsets.CustomBusinessDay()``. The bar count is the distance between
    the positions of the two shifted dates in ``trading_calendar.closes.index``.

    Args:
        data_portal: object exposing ``get_history_window``.
        trading_calendar: calendar whose ``closes.index`` contains both shifted dates.
        assets: assets to request.
        start_date: start of the window (anything with ``strftime``).
        end_date: end of the window (anything with ``strftime``).
        field: field to fetch; defaults to the module-level ``field``.

    Returns:
        Whatever ``data_portal.get_history_window`` returns for that window.
    """
    def _shifted(date):
        # Drop any time-of-day component, then step forward one business day.
        return pd.Timestamp(date.strftime('%Y-%m-%d')) + pd.offsets.CustomBusinessDay()

    window_end = _shifted(end_date)
    window_start = _shifted(start_date)

    sessions = trading_calendar.closes.index
    end_pos = sessions.get_loc(window_end)
    start_pos = sessions.get_loc(window_start)

    return data_portal.get_history_window(
        assets=assets,
        end_dt=window_end,
        bar_count=end_pos - start_pos,
        frequency='1d',
        field=field,
        data_frequency='daily',
    )
# Loader built from the bundle's daily bar reader and adjustment reader; returned by choose_loader.
pricing_loader = USEquityPricingLoader(bundle_data.equity_daily_bar_reader, bundle_data.adjustment_reader,None)
# Engine that resolves every pipeline column through choose_loader.
engine = SimplePipelineEngine(default_domain=domain, get_loader=choose_loader, asset_finder=bundle_data.asset_finder)
# Run the screen-only pipeline for the single end date and keep level 1 of the result index
# (presumably the asset level of a (date, asset) MultiIndex - confirm).
universe_tickers = engine.run_pipeline(pipeline, universe_end_date, universe_end_date).index.get_level_values(1).values.tolist()
# Daily-only data portal over the same bundle (no minute reader).
data_portal = DataPortal(bundle_data.asset_finder, trading_calendar=trading_calendar,
first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day, equity_minute_reader=None,
equity_daily_reader=bundle_data.equity_daily_bar_reader, adjustment_reader=bundle_data.adjustment_reader)
# Price history for the screened assets, from `years` years before universe_end_date up to universe_end_date.
prices = get_pricing(data_portal, trading_calendar, universe_tickers, universe_end_date - pd.DateOffset(years=years), universe_end_date)
Now to check the 'prices' data frame, it gives the below columns and rows showing data (rows deleted here for irrelevance):
Equity(0 [A]) Equity(2 [AAL]) Equity(7 [AAP]) Equity(8 [AAPL]) Equity(12 [ABBV]) Equity(13 [ABC]) Equity(20 [ABT]) Equity(27 [ACE]) Equity(37 [ACN]) Equity(45 [ADBE]) ... Equity(3149 [XEL]) Equity(3150 [XL]) Equity(3151 [XLNX]) Equity(3156 [XOM]) Equity(3165 [XRX]) Equity(3171 [YELP]) Equity(3172 [YHOO]) Equity(3175 [YUM]) Equity(3180 [ZBH]) Equity(3197 [ZTS])
The next step is finding out the sector for each symbol, for which I used Yahoo Finance. In the code shown below, I assign a number to each sector and store it against each symbol, based on the symbol index (equity number). So I have a numpy array of length 3198 in which each element holds the number of the corresponding sector. It is worth noting that the equities above are indeed 500 in total, as defined by the 'universe', and I also tried a variant of the code below that generates a numpy array of length 500 only. Both give the same error (though the index numbers are of course different, as will be shown below).
So, this is the code I used to generate the sector data:
# Derive (index, symbol) pairs by parsing the string form of each prices column, e.g. 'Equity(8 [AAPL])'.
tickers = prices.columns.values.tolist() #column labels whose string form looks like 'Equity(n [SYMBOL])'
tick_seq = [int(str(t).split('(')[1].split('[')[0]) for t in tickers] #equity index n: the integer between '(' and '['
tick_name = [str(t).split('[')[1][:-2] for t in tickers] #symbol: the text after '[' with the trailing two characters '])' removed
tick_tuple = list(zip(tick_seq,tick_name)) #pair each equity index with its symbol
tick_list = [list(i) for i in tick_tuple] #same pairs as mutable lists; get_sector results are appended to later
def get_sector(sym):
    """Look up the Yahoo Finance sector for one ticker.

    Args:
        sym: two-item sequence ``[index, symbol]``.

    Returns:
        ``[index, symbol, sector]``. ``sector`` is the string ``'NoSector'``
        when the lookup fails for any reason.
    """
    index, symbol = sym[0], sym[1]
    try:
        sector = yf.Ticker(symbol).info['sector']  # use yahoo finance to get sector information
    except Exception:
        # Best-effort lookup: any failure maps to 'NoSector'. Catching
        # Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate.
        sector = 'NoSector'
    return [index, symbol, sector]
# Look up every ticker concurrently; each result is a list [index, symbol, sector].
with ThreadPoolExecutor() as t:
    sectors = list(t.map(get_sector, tick_list))
sectors_set = {s[2] for s in sectors}  # distinct sector names
# discard (not remove): 'NoSector' is absent when every lookup succeeded,
# and remove() would raise KeyError in that case.
sectors_set.discard('NoSector')
sectors_set = {v: i for i, v in enumerate(sorted(sectors_set))}  # assign a number to each sector
sectors_set['NoSector'] = -1  # identify an unfound sector with -1
# Append the sector number to each entry; it becomes [index, symbol, sector, sec_no].
for s in sectors:
    s.append(sectors_set[s[2]])
ix = [int(s[0]) for s in sectors]  # equity index of each symbol
val = [int(s[-1]) for s in sectors]  # sec_no for each index
sectors_np = np.zeros(max(ix)+1)  # equity indices are not sequential, so size by the largest one found
sectors_np[:] = -1  # fill all as not found
sectors_np[ix] = val  # fill the found sectors at their corresponding symbol index
sectors_np = sectors_np.astype(int)  # convert the array to integer
The next step is to create a 'Classifier' as follows:
class Sector(Classifier):
    """Pipeline classifier that labels each asset with a sector number.

    The labels are read from a preloaded array indexed by the values in
    ``assets``; entries outside the mask get ``missing_value``.
    """

    missing_value = -1
    inputs = ()
    window_length = 1
    dtype = int64_dtype

    def __init__(self):
        # s_np is defined outside this class (the original note calls it sector_np).
        self.data = s_np

    def _compute(self, arrays, dates, assets, mask):
        """Return sector numbers where ``mask`` is true, else ``missing_value``.

        Prints the shapes of the sector table, ``assets`` and ``mask`` first,
        as a debugging aid.
        """
        print('sector data shape: {}'.format(self.data.shape))
        print('assets shape: {}'.format(assets.shape))
        print('mask shape: {}'.format(mask.shape))
        sector_of_asset = self.data[assets]
        return np.where(mask, sector_of_asset, self.missing_value)
The last step is to create the factor, which I do as follows:
# Factor window starts 2 years and 2 days before the universe end date.
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
# Same screen as before: top 500 by AverageDollarVolume over a 120-bar window.
universe = AverageDollarVolume(window_length=120).top(500)
sector = Sector()
pipeline = Pipeline(screen=universe, domain=domain)
# Returns over a 252-bar window, masked to the universe, demeaned within each sector group, then ranked and z-scored.
M1YR = Returns(window_length=252, mask=universe).demean(groupby=sector).rank().zscore()
pipeline.add(M1YR,'M1YR')
# Evaluate the factor for every date from factor_start_date to universe_end_date.
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
And here I get an error on running the pipeline saying:
Cell In[210], line 14, in Sector._compute(self, arrays, dates, assets, mask)
12 print('assets shape: {}'.format(assets.shape))
13 print('mask shape: {}'.format(mask.shape))
---> 14 return np.where(mask, self.data[assets], self.missing_value)
IndexError: index 3198 is out of bounds for axis 0 with size 3198
And the prints I put in the _compute function of the class print the following:
sector data shape: (3198,)
assets shape: (3175,)
mask shape: (505, 3175)
I would really appreciate it if you could help me identify what could be wrong here. Thanks.
Upvotes: 0
Views: 149
Reputation: 17
I think I found the answer, and I hope someone can verify.
The solution lies in the `Sector()` `print` functions. Those prints tell us that the `Sector(Classifier)` is being passed `assets` of shape `(3175,)` and `mask` of shape `(505, 3175)`. This means that all assets (not just the top 500) are being passed to the classifier, so it is logical that the `compute` function raises an index error.
So, what I did is that I re-wrote the part to get the Sector information as will be shown below.
Another trick was that, while `assets` has shape `(3175,)`, the maximum value in `assets` was `3198`. Although my sector-acquiring function only looks up the top 500 symbols, their indices ranged from 0 to 3197, so the line `np.zeros(max(ix)+1)` returns a NumPy array of size 3198 (indexed from 0 to 3197). If the maximum `assets` index is 3198 and it is used in `Sector(Classifier)`, an error is raised because `data[3198]` is out of bounds. So I simply changed the line to `np.zeros(max(ix)+2)` so that the indices work.
Below are my sector-acquiring function and the classifier class, which no longer produce an error. I hope someone with more zipline insight can have a look and advise whether indexing in this manner (while it works syntactically) is semantically correct.
Sector Acquiring Function
def get_sector(sym):
    """Look up the Yahoo Finance sector for one ticker.

    Args:
        sym: two-item sequence ``[index, symbol]``.

    Returns:
        ``[index, symbol, sector]``. ``sector`` is the string ``'NoSector'``
        when the lookup fails for any reason.
    """
    index, symbol = sym[0], sym[1]
    try:
        sector = yf.Ticker(symbol).info['sector']  # use yahoo finance to get sector information
    except Exception:
        # Best-effort lookup: any failure maps to 'NoSector'. Catching
        # Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate.
        sector = 'NoSector'
    return [index, symbol, sector]
def _load_cached_sectors(tick_seq, tick_name, short_list):
    """Read 'sectors.csv' and return ``(ix, val)`` if it matches the current tickers."""
    df = pd.read_csv('sectors.csv')
    ix = df['symbol_seq'].tolist()
    val = df['sec_id'].tolist()
    if df['symbol'].tolist() != tick_name:
        raise Exception('Symbol names stored on file are different from your dataset')
    # The equity indices only matter when they are used to position values
    # in the dense array, i.e. when short_list is false.
    if not short_list and ix != tick_seq:
        raise Exception('Symbol sequences stored on file are different from your dataset')
    return ix, val


def _fetch_sectors(tick_seq, tick_name):
    """Query sectors for every ticker, write them to 'sectors.csv', return ``(ix, val)``."""
    tick_list = [list(pair) for pair in zip(tick_seq, tick_name)]
    with ThreadPoolExecutor() as pool:
        # Each result is a list [index, symbol, sector].
        sectors = list(pool.map(get_sector, tick_list))
    sector_names = {s[2] for s in sectors}
    # discard (not remove): 'NoSector' is absent when every lookup succeeded,
    # and remove() would raise KeyError in that case.
    sector_names.discard('NoSector')
    sector_ids = {name: i for i, name in enumerate(sorted(sector_names))}  # assign a number to each sector
    sector_ids['NoSector'] = -1  # identify an unfound sector with -1
    ix = [int(s[0]) for s in sectors]
    val = [int(sector_ids[s[2]]) for s in sectors]
    df = pd.DataFrame({'symbol_seq': [s[0] for s in sectors], 'symbol': [s[1] for s in sectors],
                       'sector': [s[2] for s in sectors], 'sec_id': val})
    df.to_csv('sectors.csv')
    return ix, val


def build_sectors(prices, short_list=True):
    """Build an integer array of sector ids for the assets in ``prices``.

    Sector ids are read from 'sectors.csv' when that file exists and matches
    the columns of ``prices``. Otherwise they are fetched with ``get_sector``
    and the file is (re)written.

    Args:
        prices: object whose ``columns`` stringify as 'Equity(n [SYMBOL])'.
        short_list: if true, return one sector id per column, in column
            order. If false, return an array of length ``max(n) + 2`` where
            position ``n`` holds the sector id of equity ``n`` and every
            other position is -1.

    Returns:
        numpy integer array of sector ids; -1 marks an unknown sector.
    """
    labels = [str(t) for t in prices.columns.values.tolist()]
    tick_seq = [int(label.split('(')[1].split('[')[0]) for label in labels]  # equity index n
    tick_name = [label.split('[')[1][:-2] for label in labels]  # symbol name

    try:
        ix, val = _load_cached_sectors(tick_seq, tick_name, short_list)
    except Exception:
        # A missing, unreadable or stale cache all mean the same thing:
        # rebuild it. Catching Exception instead of using a bare ``except:``
        # lets KeyboardInterrupt and SystemExit propagate.
        ix, val = _fetch_sectors(tick_seq, tick_name)

    if short_list:
        return np.array(val)

    sectors_np = np.zeros(max(ix)+2)  # equity indices are not sequential, so size by the largest one found
    sectors_np[:] = -1  # fill all as not found
    sectors_np[ix] = val  # fill the found sectors at their corresponding symbol index
    return sectors_np.astype(int)  # convert the array to integer
Sector Classifier Class
class Sector(Classifier):
    """Pipeline classifier that maps each asset to an integer sector id.

    The ids come from ``build_sectors(prices, short_list=False)``, an array
    positioned by equity index in which -1 marks an unknown sector.
    """

    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        self.data = build_sectors(prices, short_list=False)

    def _compute(self, arrays, dates, assets, mask):
        """Return the sector id of each asset where ``mask`` is true.

        ``assets`` can contain indices that are not in the sector table,
        because the table is sized from the screened universe only. Those
        assets get ``missing_value`` instead of raising IndexError.
        Assumes ``assets`` is a 1-D array of integer equity indices - confirm
        against the engine in use.

        Returns:
            Array shaped like ``mask``, holding sector ids where ``mask`` is
            true and ``missing_value`` elsewhere.
        """
        sids = np.asarray(assets)
        in_table = (sids >= 0) & (sids < len(self.data))
        codes = np.full(sids.shape, self.missing_value, dtype=self.data.dtype)
        codes[in_table] = self.data[sids[in_table]]
        return np.where(mask, codes, self.missing_value)
Upvotes: 0