Reputation: 1038
I am trying to read series of files of about 190 columns, and would like provide each column with tits type (ie character, date, numeric, ect.), since vroom does not seem to read in negative numbers in the first row as numeric always. Hopefully this is a usable minimal example, which still uses the data. The data is from the CFTC's (Commodity Futures Trading Commission) commitment of traders reports.
My example is as follows:
library(vroom)
url2='[https://www.cftc.gov/files/dea/history/com_disagg_txt_2018.zip][1]'
download.file(url2,'CFTC_COT.zip')
unzip('CFTC_COT.zip', exdir = 'CFTC_COT')
data <- vroom('CFTC_COT/c_year.txt') #Woorks
data2 <- vroom( 'CFTC_COT/c_year.txt'
, col_types = c(col_character(),col_double(), col_date(format = ""),col_double(),rep(col_character(),3), rep(col_double(),177), rep( col_character(),6)))
spec(data)
The spec data is as follows:
cols(
Market_and_Exchange_Names = col_character(),
As_of_Date_In_Form_YYMMDD = col_double(),
`Report_Date_as_YYYY-MM-DD` = col_date(format = ""),
CFTC_Contract_Market_Code = col_character(),
CFTC_Market_Code = col_character(),
CFTC_Region_Code = col_character(),
CFTC_Commodity_Code = col_character(),
Open_Interest_All = col_double(),
Prod_Merc_Positions_Long_All = col_double(),
Prod_Merc_Positions_Short_All = col_double(),
Swap_Positions_Long_All = col_double(),
Swap__Positions_Short_All = col_double(),
Swap__Positions_Spread_All = col_double(),
M_Money_Positions_Long_All = col_double(),
M_Money_Positions_Short_All = col_double(),
M_Money_Positions_Spread_All = col_double(),
Other_Rept_Positions_Long_All = col_double(),
Other_Rept_Positions_Short_All = col_double(),
Other_Rept_Positions_Spread_All = col_double(),
Tot_Rept_Positions_Long_All = col_double(),
Tot_Rept_Positions_Short_All = col_double(),
NonRept_Positions_Long_All = col_double(),
NonRept_Positions_Short_All = col_double(),
Open_Interest_Old = col_double(),
Prod_Merc_Positions_Long_Old = col_double(),
Prod_Merc_Positions_Short_Old = col_double(),
Swap_Positions_Long_Old = col_double(),
Swap__Positions_Short_Old = col_double(),
Swap__Positions_Spread_Old = col_double(),
M_Money_Positions_Long_Old = col_double(),
M_Money_Positions_Short_Old = col_double(),
M_Money_Positions_Spread_Old = col_double(),
Other_Rept_Positions_Long_Old = col_double(),
Other_Rept_Positions_Short_Old = col_double(),
Other_Rept_Positions_Spread_Old = col_double(),
Tot_Rept_Positions_Long_Old = col_double(),
Tot_Rept_Positions_Short_Old = col_double(),
NonRept_Positions_Long_Old = col_double(),
NonRept_Positions_Short_Old = col_double(),
Open_Interest_Other = col_double(),
Prod_Merc_Positions_Long_Other = col_double(),
Prod_Merc_Positions_Short_Other = col_double(),
Swap_Positions_Long_Other = col_double(),
Swap__Positions_Short_Other = col_double(),
Swap__Positions_Spread_Other = col_double(),
M_Money_Positions_Long_Other = col_double(),
M_Money_Positions_Short_Other = col_double(),
M_Money_Positions_Spread_Other = col_double(),
Other_Rept_Positions_Long_Other = col_double(),
Other_Rept_Positions_Short_Other = col_double(),
Other_Rept_Positions_Spread_Other = col_double(),
Tot_Rept_Positions_Long_Other = col_double(),
Tot_Rept_Positions_Short_Other = col_double(),
NonRept_Positions_Long_Other = col_double(),
NonRept_Positions_Short_Other = col_double(),
Change_in_Open_Interest_All = col_character(),
Change_in_Prod_Merc_Long_All = col_character(),
Change_in_Prod_Merc_Short_All = col_character(),
Change_in_Swap_Long_All = col_character(),
Change_in_Swap_Short_All = col_character(),
Change_in_Swap_Spread_All = col_character(),
Change_in_M_Money_Long_All = col_character(),
Change_in_M_Money_Short_All = col_character(),
Change_in_M_Money_Spread_All = col_character(),
Change_in_Other_Rept_Long_All = col_character(),
Change_in_Other_Rept_Short_All = col_character(),
Change_in_Other_Rept_Spread_All = col_character(),
Change_in_Tot_Rept_Long_All = col_character(),
Change_in_Tot_Rept_Short_All = col_character(),
Change_in_NonRept_Long_All = col_character(),
Change_in_NonRept_Short_All = col_character(),
Pct_of_Open_Interest_All = col_double(),
Pct_of_OI_Prod_Merc_Long_All = col_double(),
Pct_of_OI_Prod_Merc_Short_All = col_double(),
Pct_of_OI_Swap_Long_All = col_double(),
Pct_of_OI_Swap_Short_All = col_double(),
Pct_of_OI_Swap_Spread_All = col_double(),
Pct_of_OI_M_Money_Long_All = col_double(),
Pct_of_OI_M_Money_Short_All = col_double(),
Pct_of_OI_M_Money_Spread_All = col_double(),
Pct_of_OI_Other_Rept_Long_All = col_double(),
Pct_of_OI_Other_Rept_Short_All = col_double(),
Pct_of_OI_Other_Rept_Spread_All = col_double(),
Pct_of_OI_Tot_Rept_Long_All = col_double(),
Pct_of_OI_Tot_Rept_Short_All = col_double(),
Pct_of_OI_NonRept_Long_All = col_double(),
Pct_of_OI_NonRept_Short_All = col_double(),
Pct_of_Open_Interest_Old = col_double(),
Pct_of_OI_Prod_Merc_Long_Old = col_double(),
Pct_of_OI_Prod_Merc_Short_Old = col_double(),
Pct_of_OI_Swap_Long_Old = col_double(),
Pct_of_OI_Swap_Short_Old = col_double(),
Pct_of_OI_Swap_Spread_Old = col_double(),
Pct_of_OI_M_Money_Long_Old = col_double(),
Pct_of_OI_M_Money_Short_Old = col_double(),
Pct_of_OI_M_Money_Spread_Old = col_double(),
Pct_of_OI_Other_Rept_Long_Old = col_double(),
Pct_of_OI_Other_Rept_Short_Old = col_double(),
Pct_of_OI_Other_Rept_Spread_Old = col_double(),
Pct_of_OI_Tot_Rept_Long_Old = col_double(),
Pct_of_OI_Tot_Rept_Short_Old = col_double(),
Pct_of_OI_NonRept_Long_Old = col_double(),
Pct_of_OI_NonRept_Short_Old = col_double(),
Pct_of_Open_Interest_Other = col_double(),
Pct_of_OI_Prod_Merc_Long_Other = col_double(),
Pct_of_OI_Prod_Merc_Short_Other = col_double(),
Pct_of_OI_Swap_Long_Other = col_double(),
Pct_of_OI_Swap_Short_Other = col_double(),
Pct_of_OI_Swap_Spread_Other = col_double(),
Pct_of_OI_M_Money_Long_Other = col_double(),
Pct_of_OI_M_Money_Short_Other = col_double(),
Pct_of_OI_M_Money_Spread_Other = col_double(),
Pct_of_OI_Other_Rept_Long_Other = col_double(),
Pct_of_OI_Other_Rept_Short_Other = col_double(),
Pct_of_OI_Other_Rept_Spread_Other = col_double(),
Pct_of_OI_Tot_Rept_Long_Other = col_double(),
Pct_of_OI_Tot_Rept_Short_Other = col_double(),
Pct_of_OI_NonRept_Long_Other = col_double(),
Pct_of_OI_NonRept_Short_Other = col_double(),
Traders_Tot_All = col_double(),
Traders_Prod_Merc_Long_All = col_character(),
Traders_Prod_Merc_Short_All = col_character(),
Traders_Swap_Long_All = col_character(),
Traders_Swap_Short_All = col_character(),
Traders_Swap_Spread_All = col_character(),
Traders_M_Money_Long_All = col_character(),
Traders_M_Money_Short_All = col_character(),
Traders_M_Money_Spread_All = col_character(),
Traders_Other_Rept_Long_All = col_character(),
Traders_Other_Rept_Short_All = col_character(),
Traders_Other_Rept_Spread_All = col_character(),
Traders_Tot_Rept_Long_All = col_double(),
Traders_Tot_Rept_Short_All = col_double(),
Traders_Tot_Old = col_double(),
Traders_Prod_Merc_Long_Old = col_character(),
Traders_Prod_Merc_Short_Old = col_character(),
Traders_Swap_Long_Old = col_character(),
Traders_Swap_Short_Old = col_character(),
Traders_Swap_Spread_Old = col_character(),
Traders_M_Money_Long_Old = col_character(),
Traders_M_Money_Short_Old = col_character(),
Traders_M_Money_Spread_Old = col_character(),
Traders_Other_Rept_Long_Old = col_character(),
Traders_Other_Rept_Short_Old = col_character(),
Traders_Other_Rept_Spread_Old = col_character(),
Traders_Tot_Rept_Long_Old = col_double(),
Traders_Tot_Rept_Short_Old = col_double(),
Traders_Tot_Other = col_double(),
Traders_Prod_Merc_Long_Other = col_double(),
Traders_Prod_Merc_Short_Other = col_double(),
Traders_Swap_Long_Other = col_character(),
Traders_Swap_Short_Other = col_character(),
Traders_Swap_Spread_Other = col_character(),
Traders_M_Money_Long_Other = col_character(),
Traders_M_Money_Short_Other = col_character(),
Traders_M_Money_Spread_Other = col_character(),
Traders_Other_Rept_Long_Other = col_double(),
Traders_Other_Rept_Short_Other = col_character(),
Traders_Other_Rept_Spread_Other = col_double(),
Traders_Tot_Rept_Long_Other = col_double(),
Traders_Tot_Rept_Short_Other = col_double(),
Conc_Gross_LE_4_TDR_Long_All = col_double(),
Conc_Gross_LE_4_TDR_Short_All = col_double(),
Conc_Gross_LE_8_TDR_Long_All = col_double(),
Conc_Gross_LE_8_TDR_Short_All = col_double(),
Conc_Net_LE_4_TDR_Long_All = col_double(),
Conc_Net_LE_4_TDR_Short_All = col_double(),
Conc_Net_LE_8_TDR_Long_All = col_double(),
Conc_Net_LE_8_TDR_Short_All = col_double(),
Conc_Gross_LE_4_TDR_Long_Old = col_double(),
Conc_Gross_LE_4_TDR_Short_Old = col_double(),
Conc_Gross_LE_8_TDR_Long_Old = col_double(),
Conc_Gross_LE_8_TDR_Short_Old = col_double(),
Conc_Net_LE_4_TDR_Long_Old = col_double(),
Conc_Net_LE_4_TDR_Short_Old = col_double(),
Conc_Net_LE_8_TDR_Long_Old = col_double(),
Conc_Net_LE_8_TDR_Short_Old = col_double(),
Conc_Gross_LE_4_TDR_Long_Other = col_double(),
Conc_Gross_LE_4_TDR_Short_Other = col_double(),
Conc_Gross_LE_8_TDR_Long_Other = col_double(),
Conc_Gross_LE_8_TDR_Short_Other = col_double(),
Conc_Net_LE_4_TDR_Long_Other = col_double(),
Conc_Net_LE_4_TDR_Short_Other = col_double(),
Conc_Net_LE_8_TDR_Long_Other = col_double(),
Conc_Net_LE_8_TDR_Short_Other = col_double(),
Contract_Units = col_character(),
CFTC_Contract_Market_Code_Quotes = col_character(),
CFTC_Market_Code_Quotes = col_character(),
CFTC_Commodity_Code_Quotes = col_character(),
CFTC_SubGroup_Code = col_character(),
FutOnly_or_Combined = col_character(),
.delim = ","
)
I can get the data to download, and be read in under the data file, but when I try to read it in by defining the column types, it fails with this message: Error: Unknown shortcut:
from the documentation of vroom, I can define each column with name = col_type(), but would like to define them in groups, since there are 177 columns that should be doubles in a row, but it starts having some columns as col_character().
To help clarify again, I'm looking for how to define column type with many columns in a row using rep() or a similar function and not defining each column's name, so I can read in many different files at once. Your help is greatly appreciated.
Upvotes: 1
Views: 551
Reputation: 1038
Thanks Polkas for getting me to think about this from a different angle. What worked well was:
data2 <- vroom( 'CFTC_COT/c_year.txt'
, col_types =
cols(
Market_and_Exchange_Names = col_character(),
As_of_Date_In_Form_YYMMDD = col_double(),
`Report_Date_as_YYYY-MM-DD` = col_date(format = ""),
CFTC_Contract_Market_Code = col_character(),
CFTC_Market_Code = col_character(),
CFTC_Region_Code = col_character(),
CFTC_Commodity_Code = col_character(),
Open_Interest_All = col_double(),
Prod_Merc_Positions_Long_All = col_double(),
Prod_Merc_Positions_Short_All = col_double(),
Swap_Positions_Long_All = col_double(),
Swap__Positions_Short_All = col_double(),
Swap__Positions_Spread_All = col_double(),
M_Money_Positions_Long_All = col_double(),
M_Money_Positions_Short_All = col_double(),
M_Money_Positions_Spread_All = col_double(),
Other_Rept_Positions_Long_All = col_double(),
Other_Rept_Positions_Short_All = col_double(),
Other_Rept_Positions_Spread_All = col_double(),
Tot_Rept_Positions_Long_All = col_double(),
Tot_Rept_Positions_Short_All = col_double(),
NonRept_Positions_Long_All = col_double(),
NonRept_Positions_Short_All = col_double(),
Open_Interest_Old = col_double(),
Prod_Merc_Positions_Long_Old = col_double(),
Prod_Merc_Positions_Short_Old = col_double(),
Swap_Positions_Long_Old = col_double(),
Swap__Positions_Short_Old = col_double(),
Swap__Positions_Spread_Old = col_double(),
M_Money_Positions_Long_Old = col_double(),
M_Money_Positions_Short_Old = col_double(),
M_Money_Positions_Spread_Old = col_double(),
Other_Rept_Positions_Long_Old = col_double(),
Other_Rept_Positions_Short_Old = col_double(),
Other_Rept_Positions_Spread_Old = col_double(),
Tot_Rept_Positions_Long_Old = col_double(),
Tot_Rept_Positions_Short_Old = col_double(),
NonRept_Positions_Long_Old = col_double(),
NonRept_Positions_Short_Old = col_double(),
Open_Interest_Other = col_double(),
Prod_Merc_Positions_Long_Other = col_double(),
Prod_Merc_Positions_Short_Other = col_double(),
Swap_Positions_Long_Other = col_double(),
Swap__Positions_Short_Other = col_double(),
Swap__Positions_Spread_Other = col_double(),
M_Money_Positions_Long_Other = col_double(),
M_Money_Positions_Short_Other = col_double(),
M_Money_Positions_Spread_Other = col_double(),
Other_Rept_Positions_Long_Other = col_double(),
Other_Rept_Positions_Short_Other = col_double(),
Other_Rept_Positions_Spread_Other = col_double(),
Tot_Rept_Positions_Long_Other = col_double(),
Tot_Rept_Positions_Short_Other = col_double(),
NonRept_Positions_Long_Other = col_double(),
NonRept_Positions_Short_Other = col_double(),
Change_in_Open_Interest_All = col_double(),
Change_in_Prod_Merc_Long_All = col_double(),
Change_in_Prod_Merc_Short_All = col_double(),
Change_in_Swap_Long_All = col_double(),
Change_in_Swap_Short_All = col_double(),
Change_in_Swap_Spread_All = col_double(),
Change_in_M_Money_Long_All = col_double(),
Change_in_M_Money_Short_All = col_double(),
Change_in_M_Money_Spread_All = col_double(),
Change_in_Other_Rept_Long_All = col_double(),
Change_in_Other_Rept_Short_All = col_double(),
Change_in_Other_Rept_Spread_All = col_double(),
Change_in_Tot_Rept_Long_All = col_double(),
Change_in_Tot_Rept_Short_All = col_double(),
Change_in_NonRept_Long_All = col_double(),
Change_in_NonRept_Short_All = col_double(),
Pct_of_Open_Interest_All = col_double(),
Pct_of_OI_Prod_Merc_Long_All = col_double(),
Pct_of_OI_Prod_Merc_Short_All = col_double(),
Pct_of_OI_Swap_Long_All = col_double(),
Pct_of_OI_Swap_Short_All = col_double(),
Pct_of_OI_Swap_Spread_All = col_double(),
Pct_of_OI_M_Money_Long_All = col_double(),
Pct_of_OI_M_Money_Short_All = col_double(),
Pct_of_OI_M_Money_Spread_All = col_double(),
Pct_of_OI_Other_Rept_Long_All = col_double(),
Pct_of_OI_Other_Rept_Short_All = col_double(),
Pct_of_OI_Other_Rept_Spread_All = col_double(),
Pct_of_OI_Tot_Rept_Long_All = col_double(),
Pct_of_OI_Tot_Rept_Short_All = col_double(),
Pct_of_OI_NonRept_Long_All = col_double(),
Pct_of_OI_NonRept_Short_All = col_double(),
Pct_of_Open_Interest_Old = col_double(),
Pct_of_OI_Prod_Merc_Long_Old = col_double(),
Pct_of_OI_Prod_Merc_Short_Old = col_double(),
Pct_of_OI_Swap_Long_Old = col_double(),
Pct_of_OI_Swap_Short_Old = col_double(),
Pct_of_OI_Swap_Spread_Old = col_double(),
Pct_of_OI_M_Money_Long_Old = col_double(),
Pct_of_OI_M_Money_Short_Old = col_double(),
Pct_of_OI_M_Money_Spread_Old = col_double(),
Pct_of_OI_Other_Rept_Long_Old = col_double(),
Pct_of_OI_Other_Rept_Short_Old = col_double(),
Pct_of_OI_Other_Rept_Spread_Old = col_double(),
Pct_of_OI_Tot_Rept_Long_Old = col_double(),
Pct_of_OI_Tot_Rept_Short_Old = col_double(),
Pct_of_OI_NonRept_Long_Old = col_double(),
Pct_of_OI_NonRept_Short_Old = col_double(),
Pct_of_Open_Interest_Other = col_double(),
Pct_of_OI_Prod_Merc_Long_Other = col_double(),
Pct_of_OI_Prod_Merc_Short_Other = col_double(),
Pct_of_OI_Swap_Long_Other = col_double(),
Pct_of_OI_Swap_Short_Other = col_double(),
Pct_of_OI_Swap_Spread_Other = col_double(),
Pct_of_OI_M_Money_Long_Other = col_double(),
Pct_of_OI_M_Money_Short_Other = col_double(),
Pct_of_OI_M_Money_Spread_Other = col_double(),
Pct_of_OI_Other_Rept_Long_Other = col_double(),
Pct_of_OI_Other_Rept_Short_Other = col_double(),
Pct_of_OI_Other_Rept_Spread_Other = col_double(),
Pct_of_OI_Tot_Rept_Long_Other = col_double(),
Pct_of_OI_Tot_Rept_Short_Other = col_double(),
Pct_of_OI_NonRept_Long_Other = col_double(),
Pct_of_OI_NonRept_Short_Other = col_double(),
Traders_Tot_All = col_double(),
Traders_Prod_Merc_Long_All = col_double(),
Traders_Prod_Merc_Short_All = col_double(),
Traders_Swap_Long_All = col_double(),
Traders_Swap_Short_All = col_double(),
Traders_Swap_Spread_All = col_double(),
Traders_M_Money_Long_All = col_double(),
Traders_M_Money_Short_All = col_double(),
Traders_M_Money_Spread_All = col_double(),
Traders_Other_Rept_Long_All = col_double(),
Traders_Other_Rept_Short_All = col_double(),
Traders_Other_Rept_Spread_All = col_double(),
Traders_Tot_Rept_Long_All = col_double(),
Traders_Tot_Rept_Short_All = col_double(),
Traders_Tot_Old = col_double(),
Traders_Prod_Merc_Long_Old = col_double(),
Traders_Prod_Merc_Short_Old = col_double(),
Traders_Swap_Long_Old = col_double(),
Traders_Swap_Short_Old = col_double(),
Traders_Swap_Spread_Old = col_double(),
Traders_M_Money_Long_Old = col_double(),
Traders_M_Money_Short_Old = col_double(),
Traders_M_Money_Spread_Old = col_double(),
Traders_Other_Rept_Long_Old = col_double(),
Traders_Other_Rept_Short_Old = col_double(),
Traders_Other_Rept_Spread_Old = col_double(),
Traders_Tot_Rept_Long_Old = col_double(),
Traders_Tot_Rept_Short_Old = col_double(),
Traders_Tot_Other = col_double(),
Traders_Prod_Merc_Long_Other = col_double(),
Traders_Prod_Merc_Short_Other = col_double(),
Traders_Swap_Long_Other = col_double(),
Traders_Swap_Short_Other = col_double(),
Traders_Swap_Spread_Other = col_double(),
Traders_M_Money_Long_Other = col_double(),
Traders_M_Money_Short_Other = col_double(),
Traders_M_Money_Spread_Other = col_double(),
Traders_Other_Rept_Long_Other = col_double(),
Traders_Other_Rept_Short_Other = col_double(),
Traders_Other_Rept_Spread_Other = col_double(),
Traders_Tot_Rept_Long_Other = col_double(),
Traders_Tot_Rept_Short_Other = col_double(),
Conc_Gross_LE_4_TDR_Long_All = col_double(),
Conc_Gross_LE_4_TDR_Short_All = col_double(),
Conc_Gross_LE_8_TDR_Long_All = col_double(),
Conc_Gross_LE_8_TDR_Short_All = col_double(),
Conc_Net_LE_4_TDR_Long_All = col_double(),
Conc_Net_LE_4_TDR_Short_All = col_double(),
Conc_Net_LE_8_TDR_Long_All = col_double(),
Conc_Net_LE_8_TDR_Short_All = col_double(),
Conc_Gross_LE_4_TDR_Long_Old = col_double(),
Conc_Gross_LE_4_TDR_Short_Old = col_double(),
Conc_Gross_LE_8_TDR_Long_Old = col_double(),
Conc_Gross_LE_8_TDR_Short_Old = col_double(),
Conc_Net_LE_4_TDR_Long_Old = col_double(),
Conc_Net_LE_4_TDR_Short_Old = col_double(),
Conc_Net_LE_8_TDR_Long_Old = col_double(),
Conc_Net_LE_8_TDR_Short_Old = col_double(),
Conc_Gross_LE_4_TDR_Long_Other = col_double(),
Conc_Gross_LE_4_TDR_Short_Other = col_double(),
Conc_Gross_LE_8_TDR_Long_Other = col_double(),
Conc_Gross_LE_8_TDR_Short_Other = col_double(),
Conc_Net_LE_4_TDR_Long_Other = col_double(),
Conc_Net_LE_4_TDR_Short_Other = col_double(),
Conc_Net_LE_8_TDR_Long_Other = col_double(),
Conc_Net_LE_8_TDR_Short_Other = col_double(),
Contract_Units = col_character(),
CFTC_Contract_Market_Code_Quotes = col_character(),
CFTC_Market_Code_Quotes = col_character(),
CFTC_Commodity_Code_Quotes = col_character(),
CFTC_SubGroup_Code = col_character(),
FutOnly_or_Combined = col_character(),
.delim = ","
)
)
I just copied the spec() data and pasted it into col_types. Then in the areas that I have an issue with it reading in as a character, I did a find replace and switched them to col_double(). It worked better than what I wanted since if the column's change it will give an error.
It does make for a long nasty bit of code though:).
Upvotes: 0
Reputation: 4184
So the problem is as always with date format (joke:p). Thus we read it as character and then format with lubridate
.
data2 <- vroom( 'CFTC_COT/c_year.txt'
, col_types = c(col_character(),col_double(), col_character(),col_double(),rep(col_character(),3), rep(col_double(),177), rep( col_character(),6)))
data2$`Report_Date_as_YYYY-MM-DD` = lubridate::ymd(data2$`Report_Date_as_YYYY-MM-DD`)
Upvotes: 1