Source code for datavalidation.validation.validate

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
validate
----------------------------------

Module with the main functionality of the validation process. It validates BikeGeometries and GeometryParameters, and
calculates any missing GeometryParameters where possible.

Author: Javier Chiyah, Heriot-Watt University, 2019
"""


import logging

from ..core import BikeGeometry, GeometryParameter
from .equations import get_equations, solve_equation
from .constraints import check_parameter_constraints, get_parameter_deviation


def validate_bike_geometry(bike_geometry: BikeGeometry):
    """
    Validates a whole BikeGeometry. The BikeGeometry is modified in place!

    Once this function returns, the BikeGeometry has changed as follows:
        - GeometryParameters that could be validated now carry a confidence value.
        - Some GeometryParameters may be considered invalid because of a low confidence value.
        - Some GeometryParameters may hold a newly calculated value, even if their previous value was valid.
        - Some GeometryParameters that had no value may have one derived from other parameters.
        - The BikeGeometry can be queried for a confidence value (get_confidence_score()).

    :param bike_geometry: BikeGeometry object to validate
    :return: None
    """
    # keep deriving parameters until a full pass leaves the amount of missing parameters unchanged,
    # as every newly calculated parameter may unlock the calculation of further ones
    while True:
        missing_before = len(bike_geometry.get_missing_parameter_list())
        calculate_missing_parameters(bike_geometry)
        missing_after = len(bike_geometry.get_missing_parameter_list())

        if missing_after == missing_before:
            break

    # this loop could run in parallel and is likely to be the most expensive loop of the package
    # there is no need to calculate the missing parameters again afterwards, because validating a
    # parameter already stores its calculated value
    for geometry_parameter in bike_geometry.get_parameter_list():
        validate_geometry_parameter(geometry_parameter, bike_geometry)

    logging.info("BikeGeometry validated")
def validate_geometry_parameter(parameter: GeometryParameter, bike_geometry: BikeGeometry):
    """
    Validates one GeometryParameter using the other parameters of its BikeGeometry.

    The function only calls setters on the GeometryParameter; the BikeGeometry is only read and
    handed to the equation solver.

    After calling it, a GeometryParameter that could be validated will:
        - Have a confidence value (the average similarity between its values and the solved values).
        - Have a calculated value (the solved value with the highest similarity).

    When no equation is available for the parameter, the confidence is taken from the deviation from
    the geometry statistics instead.

    :param parameter: GeometryParameter inside the BikeGeometry
    :param bike_geometry: the BikeGeometry
    :return: None
    """
    # skip parameters that are not numbers and parameters that already hold a calculated value: the
    # latter were produced by `calculate_missing_parameters()` and are not validated
    if not parameter.is_number() or parameter.calculated_value is not None:
        return None

    equation_list = get_equations(parameter.name, bike_geometry.get_parameter_list())

    if len(equation_list) == 0:
        # the parameter cannot be validated using maths, so fall back to the deviation from the
        # average statistics
        _set_confidence_from_deviation(parameter)
        return None

    for formula in equation_list:
        solutions = solve_equation(formula, parameter.name, bike_geometry)
        if len(solutions) == 0:
            # this equation gave nothing to compare against
            continue

        if parameter.value is None:
            # legacy branch: a parameter without a value is not expected to get here
            logging.error("GeometryParameter('{}') has reached a deprecated code section in "
                          "validate.validate_geometry_parameter".format(parameter.name))
            parameter.set_calculated_value(solutions)
            continue

        if parameter.calculated_value is not None:
            # marked as "should never happen" by the original author
            # NOTE(review): if set_calculated_value() makes calculated_value non-None, a later
            # equation with solutions lands here and may overwrite the similarity-based confidence
            # with the deviation-based one -- confirm whether that is intended
            _set_confidence_from_deviation(parameter)
            continue

        # regular case: compare every current value against every solved value
        current_values = parameter.normalised_value
        if not isinstance(current_values, list):
            current_values = [current_values]

        scored_candidates = [
            (get_value_similarity(current, candidate), candidate)
            for current in current_values
            for candidate in solutions
        ]
        if len(scored_candidates) == 0:
            continue

        best_candidate = max(scored_candidates, key=lambda pair: pair[0])

        # the confidence is the average similarity over all the comparisons
        average_similarity = sum(score for score, _ in scored_candidates) / len(scored_candidates)
        parameter.set_confidence(average_similarity)

        # the calculated value is the most similar solved value; the confidence set above is kept
        parameter.set_calculated_value(best_candidate[1], change_confidence=False)
def calculate_missing_parameters(bike_geometry: BikeGeometry, include_invalid: bool = True):
    """
    Tries to calculate the GeometryParameters that a BikeGeometry is missing.
    The BikeGeometry is modified in place!

    The missing parameters are the ones reported by BikeGeometry.get_missing_parameter_list().
    Unless disabled, the invalid parameters reported by get_invalid_parameters() are recalculated too.

    :param bike_geometry: the BikeGeometry
    :param include_invalid: if it should calculate invalid parameters too, default is True
    :return: None
    """
    # names of every parameter to calculate: the missing ones first, then (optionally) the invalid ones
    pending_names = bike_geometry.get_missing_parameter_list()
    if include_invalid:
        # extends the list returned above in place
        pending_names += get_invalid_parameters(bike_geometry)

    # could run in parallel if needed, as each call only modifies the parameter it is given
    for pending_name in pending_names:
        calculate_parameter(pending_name, bike_geometry)
def calculate_parameter(parameter_name: str, bike_geometry: BikeGeometry):
    """
    Calculates the value of a parameter, and sets its confidence value if it can be derived from the
    geometry statistics.

    It modifies the GeometryParameter of the BikeGeometry with the given name, creating the
    GeometryParameter (with no value) and adding it to the BikeGeometry if it does not exist yet.
    Parameters that already hold a calculated value are left untouched.

    :param parameter_name: name of the GeometryParameter
    :param bike_geometry: the BikeGeometry
    :return: None
    """
    parameter = bike_geometry.get_parameter(parameter_name)

    if parameter is not None:
        if parameter.calculated_value is not None:
            # already calculated (e.g. in a previous iteration), nothing left to do
            return
    else:
        # unknown parameter: register an empty one so it can receive the calculated value
        parameter = GeometryParameter(parameter_name, None)
        bike_geometry.set_parameter(parameter)

    # set the confidence from the deviation first to save the previous confidence (if it can be calculated)
    _set_confidence_from_deviation(parameter)

    for formula in get_equations(parameter.name, bike_geometry.get_parameter_list()):
        solutions = solve_equation(formula, parameter.name, bike_geometry)
        if len(solutions) == 0:
            continue

        parameter.set_calculated_value(solutions, change_confidence=True)
def get_invalid_parameters(bike_geometry: BikeGeometry) -> list:
    """
    Collects the names of the numeric GeometryParameters of a BikeGeometry that are invalid.
    Only numeric parameters are considered, as those are the ones that can be calculated.

    Whether a parameter is invalid is decided by is_parameter_invalid().

    Note: this function should be inside BikeGeometry, but is_parameter_invalid() uses the geometry
    constraints, thus it is best to keep the core of the datavalidation module (BikeGeometry)
    separated from the constraints module.

    :param bike_geometry: the BikeGeometry
    :return: list of invalid parameters (list of str)
    """
    # a numeric parameter flagged as invalid is likely to hold a wrong value
    return [
        candidate.name
        for candidate in bike_geometry.get_parameter_list()
        if candidate.is_number() and is_parameter_invalid(candidate, bike_geometry)
    ]
def get_value_similarity(value1, value2) -> float:
    """
    Gets the similarity between two values.

    The similarity is the ratio between the smaller and the larger value. For non-negative values it
    is a number between 0 and 1, where 1 means that the values are the same and 0 means that the
    values are completely different. Negative inputs are not normalised and can give results outside
    that range.

    If value2 is a list, value1 is compared against each of its elements and the lowest similarity
    found is returned.

    According to the original author this function is duplicated in the constraints module; a common
    utility file could improve this. Keep both copies in sync when changing either.

    :param value1: value (number)
    :param value2: value (number) or list of values
    :return: (0 to 1) percentage float of how close value1 is to value2
    :raises ValueError: if value2 is an empty list
    """
    if isinstance(value2, list):
        # against a list, the similarity is the worst one found over all of its elements
        return min(get_value_similarity(value1, item) for item in value2)

    if value1 == value2:
        # identical values are fully similar; this also covers 0 against 0, which would otherwise
        # divide zero by zero
        return 1.0

    smaller, larger = min(value1, value2), max(value1, value2)
    if larger == 0:
        # different values with a zero denominator have no defined ratio: report them as completely
        # different instead of raising ZeroDivisionError
        return 0.0

    return float(smaller) / float(larger)
def is_parameter_invalid(parameter: GeometryParameter, bike_geometry: BikeGeometry) -> bool:
    """
    Tells whether a GeometryParameter is invalid.

    A GeometryParameter is invalid if either of these holds:
        - It does not pass check_parameter_constraints() for the given BikeGeometry.
        - It has a confidence value and that value is below the parameter threshold of the BikeGeometry.

    A parameter without a confidence value is never invalid because of its confidence.

    Note: this function should be inside BikeGeometry, but it uses the geometry constraints, thus it
    is best to keep the core of the datavalidation module (BikeGeometry) separated from the
    constraints module.

    :param parameter: GeometryParameter to check
    :param bike_geometry: BikeGeometry to make the comparison
    :return: bool, True if the parameter IS invalid
    """
    if not check_parameter_constraints(parameter.name, bike_geometry):
        return True

    confidence = parameter.confidence
    # TODO: change the access to private member _PARAMETER_THRESHOLD
    return confidence is not None and confidence < bike_geometry._PARAMETER_THRESHOLD
def _set_confidence_from_deviation(parameter: GeometryParameter):
    """
    Sets the confidence of a parameter from its deviation from the normal geometry statistics.

    The deviation is requested with invert=True and stored directly as the confidence. When no
    deviation is available (None), the parameter is left untouched, so calling this is always safe.

    :param parameter: GeometryParameter to set confidence
    :return: None
    """
    inverted_deviation = get_parameter_deviation(parameter, invert=True)
    if inverted_deviation is None:
        # the deviation could not be calculated, keep whatever confidence the parameter has
        return

    parameter.set_confidence(inverted_deviation)