Skip to content

Commit

Permalink
Merge pull request #91 from provectus/add_ml_tests
Browse files Browse the repository at this point in the history
[MAINTENANCE] added tests for ML models
  • Loading branch information
bvolodarskiy authored Jun 15, 2023
2 parents 05440e9 + 63d4f23 commit d912689
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 25 deletions.
2 changes: 1 addition & 1 deletion functions/data_test/Expectation_report_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def to_expectation_suite(
new_column_in_mapping = {}
try:
mapping_schema = mapping_config[suite_name]
except KeyError:
except (KeyError, TypeError):
mapping_schema = None

data_asset = data_context.get_datasource(
Expand Down
135 changes: 111 additions & 24 deletions functions/data_test/profiling.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import math

from ydata_profiling import ProfileReport
import os
import boto3
Expand All @@ -12,6 +14,8 @@
from great_expectations.data_context.types.base import (DataContextConfig,
S3StoreBackendDefaults)
import yaml
from scipy.stats import t

DEFAULT_CONFIG_FILE_PATH = "great_expectations/great_expectations.yml"

if os.environ['ENVIRONMENT'] == 'local':
Expand All @@ -37,33 +41,116 @@ def expectations_null(name, summary, batch, *args):
return name, summary, batch


def expectations_mean(name, summary, batch, *args):
    """Add a mean-range expectation for column ``name`` to ``batch``.

    The accepted range is centred on ``summary["mean"]`` and extends by
    ``0.99 * summary["std"] / sqrt(summary["n"])`` on each side.

    Args:
        name: Column name forwarded as ``column``.
        summary: Mapping providing the ``"n"``, ``"std"`` and ``"mean"``
            entries for the column.
        batch: Object exposing ``expect_column_mean_to_be_between``.
        *args: Accepted and ignored.

    Returns:
        The ``(name, summary, batch)`` triple that was passed in.
    """
    sample_size = summary["n"]
    # NOTE(review): 0.99 multiplies the standard error directly; it looks
    # like a confidence level rather than a critical value -- confirm.
    half_width = 0.99 * (summary["std"] / math.sqrt(sample_size))
    centre = summary["mean"]
    batch.expect_column_mean_to_be_between(
        column=name,
        min_value=centre - half_width,
        max_value=centre + half_width,
    )
    return name, summary, batch


def expectations_median(name, summary, batch, *args):
    """Add a median-range expectation for column ``name`` to ``batch``.

    The entries of ``summary["value_counts_index_sorted"]`` are expanded
    into a flat list (each key repeated by its count, in iteration order).
    Two positions around the middle of that list are then picked and the
    values found there become the allowed minimum and maximum median.
    Nothing is registered when either position falls outside the list.

    Args:
        name: Column name forwarded as ``column``.
        summary: Mapping providing ``"value_counts_index_sorted"``, an
            object whose ``items()`` yields ``(value, count)`` pairs.
        batch: Object exposing ``expect_column_median_to_be_between``.
        *args: Accepted and ignored.

    Returns:
        The ``(name, summary, batch)`` triple that was passed in.
    """
    expanded = []
    for value, count in summary["value_counts_index_sorted"].items():
        expanded += [value] * count

    q = 0.5
    total = len(expanded)
    # 2.58 is presumably the two-sided 99% normal critical value -- confirm.
    lower_idx = int(total * q - 2.58 * math.sqrt(total * q * (1 - q)))
    upper_idx = int(total * q + 2.58 * math.sqrt(total * q * (1 - q)))

    # Too few observations for both positions to land inside the list.
    if lower_idx >= total or upper_idx >= total:
        return name, summary, batch

    batch.expect_column_median_to_be_between(
        column=name,
        min_value=expanded[lower_idx],
        max_value=expanded[upper_idx],
    )
    return name, summary, batch


def expectations_stdev(name, summary, batch, *args):
    """Add a standard-deviation-range expectation for column ``name``.

    The accepted range is ``summary["std"]`` plus/minus a margin of
    ``t_critical * std / sqrt(n)``, where ``t_critical`` is the Student-t
    quantile at ``1 - alpha / 2`` for a 0.99 confidence level and
    ``n - 1`` degrees of freedom.

    No expectation is registered when fewer than two observations are
    available: with ``n == 1`` the degrees of freedom are zero and the
    t quantile is NaN, and with ``n == 0`` the margin would divide by zero.

    Args:
        name: Column name forwarded as ``column``.
        summary: Mapping providing the ``"n"`` and ``"std"`` entries.
        batch: Object exposing ``expect_column_stdev_to_be_between``.
        *args: Accepted and ignored.

    Returns:
        The ``(name, summary, batch)`` triple that was passed in.
    """
    n = summary["n"]
    std = summary["std"]
    if n < 2:
        # Not enough data for a meaningful interval; skip the expectation
        # instead of registering NaN bounds or raising ZeroDivisionError.
        return name, summary, batch

    confidence_level = 0.99
    degrees_of_freedom = n - 1
    alpha = 1 - confidence_level
    t_critical = t.ppf(1 - alpha / 2, degrees_of_freedom)
    margin_of_error = t_critical * (std / math.sqrt(n))
    batch.expect_column_stdev_to_be_between(
        column=name,
        min_value=std - margin_of_error,
        max_value=std + margin_of_error,
    )
    return name, summary, batch


def expectations_quantile(name, summary, batch, *args):
    """Add a quantile-range expectation for column ``name`` to ``batch``.

    For each of the quantiles 0.05, 0.25, 0.5, 0.75 and 0.95 the allowed
    range runs from that quantile's value in ``summary`` up to the value
    of the next one, with ``summary["max"]`` closing the last range.

    Args:
        name: Column name forwarded as ``column``.
        summary: Mapping providing the ``"5%"``, ``"25%"``, ``"50%"``,
            ``"75%"``, ``"95%"`` and ``"max"`` entries.
        batch: Object exposing
            ``expect_column_quantile_values_to_be_between``.
        *args: Accepted and ignored.

    Returns:
        The ``(name, summary, batch)`` triple that was passed in.
    """
    boundary_keys = ["5%", "25%", "50%", "75%", "95%", "max"]
    boundaries = [summary[key] for key in boundary_keys]
    # Pair every boundary with its successor: [5%, 25%], [25%, 50%], ...
    value_ranges = [
        [lower, upper] for lower, upper in zip(boundaries, boundaries[1:])
    ]
    quantile_ranges = {
        "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
        "value_ranges": value_ranges,
    }
    batch.expect_column_quantile_values_to_be_between(
        column=name, quantile_ranges=quantile_ranges)
    return name, summary, batch


def expectations_z_score(name, summary, batch, *args):
    """Add a z-score expectation for column ``name`` to ``batch``.

    The threshold is the z-score of the column maximum,
    ``(summary["max"] - summary["mean"]) / summary["std"]``, and the
    expectation is registered with ``double_sided=True``.

    Nothing is registered when the standard deviation is zero. The
    division is performed only after that check, so a constant column
    no longer raises ``ZeroDivisionError``.

    Args:
        name: Column name forwarded as ``column``.
        summary: Mapping providing the ``"mean"``, ``"std"`` and ``"max"``
            entries.
        batch: Object exposing
            ``expect_column_value_z_scores_to_be_less_than``.
        *args: Accepted and ignored.

    Returns:
        The ``(name, summary, batch)`` triple that was passed in.
    """
    mean = summary["mean"]
    std = summary["std"]
    maximum = summary["max"]
    if std != 0:
        # Compute the threshold only once the divisor is known non-zero.
        threshold = (maximum - mean) / std
        batch.expect_column_value_z_scores_to_be_less_than(
            column=name, threshold=threshold, double_sided=True)
    return name, summary, batch


class MyExpectationHandler(Handler):
    """Handler configured with a type-name -> expectation-builders mapping.

    Each key is a type name and each value is the list of callables
    registered for it; the mapping is passed to ``Handler.__init__``
    together with ``typeset``. Every type includes ``expectations_null``.
    """

    def __init__(self, typeset, *args, **kwargs):
        """Build the mapping and delegate to the base ``Handler``.

        Args:
            typeset: Forwarded unchanged to ``Handler.__init__``.
            *args: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        # Each key is listed exactly once. Repeating a key in a dict
        # literal silently discards the earlier value, so only one
        # definition per type is kept here.
        mapping = {
            "Unsupported": [
                expectations_null,
            ],
            "Categorical": [
                expectation_algorithms.categorical_expectations,
                expectations_null,
            ],
            "Text": [
                expectation_algorithms.categorical_expectations,
                expectations_null,
            ],
            "Boolean": [
                expectations_null,
            ],
            "Numeric": [
                generic_expectations_without_null,
                expectations_null,
                expectation_algorithms.numeric_expectations,
                expectations_mean,
                expectations_median,
                expectations_stdev,
                expectations_quantile,
                expectations_z_score,
            ],
            "URL": [
                expectation_algorithms.url_expectations,
                expectations_null,
            ],
            "File": [
                expectation_algorithms.file_expectations,
                expectations_null,
            ],
            "Path": [
                expectation_algorithms.path_expectations,
                expectations_null,
            ],
            "DateTime": [
                expectation_algorithms.datetime_expectations,
                expectations_null,
            ],
            "Image": [
                expectation_algorithms.image_expectations,
                expectations_null,
            ],
        }
        super().__init__(mapping, typeset, *args, **kwargs)

Expand Down

0 comments on commit d912689

Please sign in to comment.