Reputation: 19
I want to perform the validation on the data. I have written the code using the pandas schema , instead of pandas schema how can I pass a json file which contains all the rules of validation in it and then apply it on the csv file.
That means to apply which rule on which column must be taken from the json file instead of the pandas schema and generate the error file.
def check_decimal(dec):
try:
Decimal(dec)
except InvalidOperation:
return False
return True
def check_int(num):
try:
int(num)
except ValueError:
return False
return True
def do_validation():
# read the data
data = pd.read_csv('data.csv')
# define validation elements
decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')]
null_validation = [CustomElementValidation(lambda d: d is None, 'this field cannot be null')]
# define validation schema
schema = pandas_schema.Schema([
Column('dec1', decimal_validation + null_validation),
Column('dec2', decimal_validation),
Column('dec3', decimal_validation),
Column('dec4', decimal_validation),
Column('dec5', decimal_validation),
Column('dec6', decimal_validation),
Column('dec7', decimal_validation),
Column('company_id', int_validation + null_validation),
Column('currency_id', int_validation + null_validation),
Column('country_id', int_validation + null_validation)])
# apply validation
errors = schema.validate(data)
errors_index_rows = [e.row for e in errors]
data_clean = data.drop(index=errors_index_rows)
# save data
pd.DataFrame({'col':errors}).to_csv('errors55.csv')
Upvotes: 1
Views: 4926
Reputation: 1747
So, I don't know anything really about pandas_schema
, but if you have columns and their validators in a json like this:
{
"dec1": ['decimal', 'null'],
"dec2": ['decimal'],
"dec3": ['decimal'],
"dec4": ['decimal'],
"dec5": ['decimal'],
"dec6": ['decimal'],
"dec7": ['decimal'],
"company_id": ['int', 'null'],
"currency_id": ['int', 'null'],
"country_id": ['int', 'null']
}
Then you can use a dict of validators and a list comprehension to generate your Column
objects for the Schema
:
def check_decimal(dec):
try:
Decimal(dec)
except InvalidOperation:
return False
return True
def check_int(num):
try:
int(num)
except ValueError:
return False
return True
VALIDATORS = {
'decimal': CustomElementValidation(lambda d: check_decimal(d), 'is not decimal'),
'int': CustomElementValidation(lambda i: check_int(i), 'is not integer'),
'null': CustomElementValidation(lambda d: d is None, 'this field cannot be null'),
}
def do_validation():
# read the data
data = pd.read_csv('data.csv')
with open('my_json_schema.json', 'r') as my_json:
json_schema = json.load(my_json)
column_list = [Column(k, [VALIDATORS[v] for v in vals]) for k, vals in json_schema.items()]
schema = pandas_schema.Schema(column_list)
# apply validation
errors = schema.validate(data)
errors_index_rows = [e.row for e in errors]
data_clean = data.drop(index=errors_index_rows)
# save data
pd.DataFrame({'col':errors}).to_csv('errors55.csv')
EDIT:
For using validators with arguments defined in the JSON you are going to need to change up both the JSON format and the code a bit. The following should work, but I can't test it myself.
{
"dec1": [['decimal'], ['null']],
"dec2": [['decimal'], ['range', 0, 10]],
"dec3": [['decimal']],
"dec4": [['decimal']],
"dec5": [['decimal']],
"dec6": [['decimal']],
"dec7": [['decimal']],
"company_id": [['int'], ['null']],
"currency_id": [['int'], ['null']],
"country_id": [['int'], ['null']]
}
def get_validator(opts)
VALIDATORS = {
'decimal': (CustomElementValidation, [lambda d: check_decimal(d), 'is not decimal']),
'int': (CustomElementValidation, [lambda i: check_int(i), 'is not integer']),
'null': (CustomElementValidation, [lambda d: d is None, 'this field cannot be null']),
'range': (InRangeValidation, []),
}
func, args = VALIDATORS[opts[0]]
args.extend(opts[1:])
return func(*args)
def do_validation():
# read the data
data = pd.read_csv('data.csv')
with open('my_json_schema.json', 'r') as my_json:
json_schema = json.load(my_json)
column_list = [Column(k, [get_validator(v) for v in vals]) for k, vals in json_schema.items()]
schema = pandas_schema.Schema(column_list)
# apply validation
errors = schema.validate(data)
errors_index_rows = [e.row for e in errors]
data_clean = data.drop(index=errors_index_rows)
# save data
pd.DataFrame({'col':errors}).to_csv('errors55.csv')
Upvotes: 3