redhima
redhima

Reputation: 19

validation of csv using json schema in python

I want to validate the data in a CSV file. I have written the code below using pandas_schema, with the schema hard-coded in Python. Instead of hard-coding the schema, how can I pass in a JSON file that contains all the validation rules and apply it to the CSV file?

In other words, which rule applies to which column should be read from the JSON file rather than from the hard-coded pandas_schema definition, and the error file should then be generated from the result.

def check_decimal(dec):
    """Return True if ``dec`` can be converted to a ``Decimal``.

    Args:
        dec: The cell value to test (a string, a number, or any other object).

    Returns:
        bool: True when ``Decimal(dec)`` succeeds, False otherwise.
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        # InvalidOperation: malformed numeric text such as "abc".
        # TypeError / ValueError: unsupported input such as None or a
        # wrongly shaped sequence; these must count as "not decimal" rather
        # than abort the whole validation run.
        return False
    return True


def check_int(num):
    """Return True if ``num`` can be converted with ``int()``.

    Args:
        num: The cell value to test (a string, a number, or any other object).

    Returns:
        bool: True when ``int(num)`` succeeds, False otherwise.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # TypeError: unsupported input such as None.
        # OverflowError: infinite floats.
        return False
    return True


def do_validation():
    """Validate ``data.csv`` against a fixed schema and write the errors to disk.

    Reads ``data.csv``, validates the columns listed in the schema below,
    writes every validation warning to ``errors55.csv`` and returns the rows
    that produced no warning.

    Returns:
        pandas.DataFrame: ``data`` without the rows referenced by a warning.
    """
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    # Each predicate returns True for a VALID cell; the message describes the
    # failure that is reported when it returns False.
    decimal_validation = [CustomElementValidation(check_decimal, 'is not decimal')]
    int_validation = [CustomElementValidation(check_int, 'is not integer')]
    # The previous predicate (`d is None`) was inverted: it passed only empty
    # cells and flagged every populated one. pd.notnull also recognises the
    # NaN that read_csv produces for missing values.
    null_validation = [
        CustomElementValidation(lambda d: pd.notnull(d), 'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation),
    ])

    # apply validation
    errors = schema.validate(data)

    # save the error report first so it exists even if dropping rows fails
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')

    # drop every row that produced at least one warning
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # Previously the cleaned frame was computed and then thrown away.
    return data_clean

Upvotes: 1

Views: 4926

Answers (1)

PyPingu
PyPingu

Reputation: 1747

So, I don't know anything really about pandas_schema, but if you have columns and their validators in a json like this:

{
    "dec1": ["decimal", "null"],
    "dec2": ["decimal"],
    "dec3": ["decimal"],
    "dec4": ["decimal"],
    "dec5": ["decimal"],
    "dec6": ["decimal"],
    "dec7": ["decimal"],
    "company_id": ["int", "null"],
    "currency_id": ["int", "null"],
    "country_id": ["int", "null"]
}

Then you can use a dict of validators and a list comprehension to generate your Column objects for the Schema:

def check_decimal(dec):
    """Tell whether ``dec`` is acceptable input for ``Decimal``.

    Args:
        dec: The cell value to test.

    Returns:
        bool: True if ``Decimal(dec)`` can be constructed, otherwise False.
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        # Besides malformed numeric text (InvalidOperation), unsupported
        # inputs such as None raise TypeError/ValueError; treat them as
        # invalid instead of letting the exception escape the validator.
        return False
    return True


def check_int(num):
    """Tell whether ``num`` is acceptable input for ``int()``.

    Args:
        num: The cell value to test.

    Returns:
        bool: True if ``int(num)`` succeeds, otherwise False.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # TypeError (e.g. None) and OverflowError (infinite float) were not
        # caught before and would have aborted the validation run.
        return False
    return True


# Maps the validator names used in the JSON schema file to validator objects.
# Each predicate returns True for a VALID cell; the message describes the
# failure reported when it returns False.
VALIDATORS = {
    'decimal': CustomElementValidation(check_decimal, 'is not decimal'),
    'int': CustomElementValidation(check_int, 'is not integer'),
    # The previous predicate (`d is None`) was inverted: it accepted only empty
    # cells. pd.notnull also recognises the NaN that read_csv uses for blanks.
    'null': CustomElementValidation(lambda d: pd.notnull(d), 'this field cannot be null'),
}

def do_validation():
    """Validate ``data.csv`` using the rules in ``my_json_schema.json``.

    The JSON file maps each column name to a list of validator names, which
    are looked up in ``VALIDATORS``. All validation warnings are written to
    ``errors55.csv``.

    Returns:
        pandas.DataFrame: ``data`` without the rows referenced by a warning.

    Raises:
        KeyError: If the JSON file names a validator missing from ``VALIDATORS``.
    """
    # read the data
    data = pd.read_csv('data.csv')
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open('my_json_schema.json', 'r', encoding='utf-8') as my_json:
        json_schema = json.load(my_json)

    # build one Column per JSON entry
    column_list = [
        Column(name, [VALIDATORS[rule] for rule in rules])
        for name, rules in json_schema.items()
    ]
    schema = pandas_schema.Schema(column_list)

    # apply validation
    errors = schema.validate(data)

    # save the error report first so it exists even if dropping rows fails
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')

    # drop every row that produced at least one warning
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # Previously the cleaned frame was computed and then thrown away.
    return data_clean

EDIT:

For using validators with arguments defined in the JSON you are going to need to change up both the JSON format and the code a bit. The following should work, but I can't test it myself.

{
    "dec1": [["decimal"], ["null"]],
    "dec2": [["decimal"], ["range", 0, 10]],
    "dec3": [["decimal"]],
    "dec4": [["decimal"]],
    "dec5": [["decimal"]],
    "dec6": [["decimal"]],
    "dec7": [["decimal"]],
    "company_id": [["int"], ["null"]],
    "currency_id": [["int"], ["null"]],
    "country_id": [["int"], ["null"]]
}


def get_validator(opts):
    """Build a validator object from one JSON rule specification.

    Args:
        opts: A non-empty list whose first item is the validator name and
            whose remaining items are extra positional arguments for that
            validator, e.g. ``['range', 0, 10]``.

    Returns:
        The validator instance created for this rule.

    Raises:
        KeyError: If the validator name is not one of the known names.
        IndexError: If ``opts`` is empty.
    """
    # name -> (validator class, fixed leading positional arguments)
    validators = {
        'decimal': (CustomElementValidation, [check_decimal, 'is not decimal']),
        'int': (CustomElementValidation, [check_int, 'is not integer']),
        # The predicate returns True for a VALID cell. The previous
        # `d is None` was inverted; pd.notnull also recognises NaN.
        'null': (CustomElementValidation,
                 [lambda d: pd.notnull(d), 'this field cannot be null']),
        # 'range' takes all of its arguments from the JSON rule itself.
        'range': (InRangeValidation, []),
    }
    func, args = validators[opts[0]]
    # Combine fixed and JSON-supplied arguments without mutating the table.
    return func(*args, *opts[1:])


def do_validation():
    """Validate ``data.csv`` using parameterised rules from ``my_json_schema.json``.

    The JSON file maps each column name to a list of rule specifications;
    each specification is turned into a validator by ``get_validator``. All
    validation warnings are written to ``errors55.csv``.

    Returns:
        pandas.DataFrame: ``data`` without the rows referenced by a warning.
    """
    # read the data
    data = pd.read_csv('data.csv')
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open('my_json_schema.json', 'r', encoding='utf-8') as my_json:
        json_schema = json.load(my_json)

    # build one Column per JSON entry
    column_list = [
        Column(name, [get_validator(rule) for rule in rules])
        for name, rules in json_schema.items()
    ]
    schema = pandas_schema.Schema(column_list)

    # apply validation
    errors = schema.validate(data)

    # save the error report first so it exists even if dropping rows fails
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')

    # drop every row that produced at least one warning
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # Previously the cleaned frame was computed and then thrown away.
    return data_clean

Upvotes: 3

Related Questions