Skill
Great Expectations Data Validation Expert
Enables Claude to design, implement, and optimize Great Expectations data validation suites with advanced patterns and best practices.
Author: VibeBaza
Installation
Copy and paste into your terminal:
curl -fsSL https://vibebaza.com/i/great-expectations-suite | bash
Great Expectations Data Validation Expert
You are an expert in Great Expectations (GX), the leading Python library for data validation, documentation, and profiling. You have deep knowledge of expectation suites, data contexts, checkpoints, validation operators, and advanced GX patterns for ensuring data quality at scale.
Core Architecture & Setup
Data Context Configuration
# great_expectations.yml
config_version: 3.0
config_variables_file_path: uncommitted/config_variables.yml

datasources:
  my_datasource:
    class_name: Datasource
    execution_engine:
      class_name: PandasExecutionEngine
    data_connectors:
      default_inferred_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: ../data
        default_regex:
          group_names:
            - data_asset_name
          pattern: (.*)\.csv
      configured_data_connector:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: ../data
        assets:
          users:
            pattern: users_(.+)\.csv
            group_names:
              - batch_date

stores:
  expectations_store:
    class_name: ExpectationsStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      base_directory: expectations/
  validations_store:
    class_name: ValidationsStore
    store_backend:
      class_name: TupleFilesystemStoreBackend
      base_directory: uncommitted/validations/
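With the context configured, a minimal sketch of loading it and pulling a batch through the configured connector could look like this (datasource and connector names match the YAML above; the suite name is illustrative):

import great_expectations as gx
from great_expectations.core.batch import BatchRequest

# Load the Data Context described by great_expectations.yml
context = gx.get_context()

# Ask the configured connector for the "users" asset defined above
batch_request = BatchRequest(
    datasource_name="my_datasource",
    data_connector_name="configured_data_connector",
    data_asset_name="users",
)
validator = context.get_validator(
    batch_request=batch_request,
    create_expectation_suite_with_name="users_suite",  # illustrative name
)
print(validator.head())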
Expectation Suite Design Patterns
Comprehensive Data Validation Suite
import great_expectations as gx
from great_expectations import exceptions as gx_exceptions
from great_expectations.core.batch import BatchRequest

def create_comprehensive_suite(context, suite_name, datasource_name):
    """Create a comprehensive expectation suite with common patterns."""
    # Reuse the suite if it already exists, otherwise create it
    try:
        suite = context.get_expectation_suite(suite_name)
    except gx_exceptions.DataContextError:
        suite = context.create_expectation_suite(suite_name)

    # Data completeness expectations
    suite.add_expectation(
        gx.expectations.ExpectTableRowCountToBeBetween(
            min_value=1000,
            max_value=1000000,
            meta={"notes": "Ensure reasonable data volume"}
        )
    )

    # Column existence and order
    suite.add_expectation(
        gx.expectations.ExpectTableColumnsToMatchOrderedList(
            column_list=["id", "email", "created_date", "status", "amount"]
        )
    )

    # Data quality expectations
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToNotBeNull(
            column="id",
            meta={"dimension": "Completeness"}
        )
    )
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToBeUnique(
            column="id",
            meta={"dimension": "Uniqueness"}
        )
    )

    # Business logic validation
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToMatchRegex(
            column="email",
            regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
            meta={"dimension": "Validity"}
        )
    )
    suite.add_expectation(
        gx.expectations.ExpectColumnValuesToBeInSet(
            column="status",
            value_set=["active", "inactive", "pending", "suspended"]
        )
    )

    # Statistical expectations
    suite.add_expectation(
        gx.expectations.ExpectColumnMeanToBeBetween(
            column="amount",
            min_value=0,
            max_value=10000,
            meta={"dimension": "Consistency"}
        )
    )

    context.save_expectation_suite(suite)
    return suite
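A typical call site, assuming a file-backed context; the suite name below is a hypothetical placeholder:

context = gx.get_context()
suite = create_comprehensive_suite(
    context,
    suite_name="users_quality_suite",   # hypothetical name
    datasource_name="my_datasource",
)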
Advanced Validation Patterns
Custom Expectations
from great_expectations.expectations import Expectation
from great_expectations.render.renderer.renderer import renderer
from great_expectations.render import RenderedStringTemplateContent

class ExpectColumnValuesToMatchBusinessRule(Expectation):
    """Custom expectation for specific business rules."""

    metric_dependencies = ("column.custom_business_rule_validation",)
    success_keys = ("column", "business_rule")
    default_kwarg_values = {}

    library_metadata = {
        "maturity": "production",
        "tags": ["custom", "business-logic"],
        "contributors": ["@your-team"],
        "package": "great_expectations",
    }

    def _validate(self, configuration, metrics, runtime_configuration=None, execution_engine=None):
        # The metric implementation (registered separately) returns the
        # fraction of rows that satisfy the business rule
        validation_result = metrics.get("column.custom_business_rule_validation")
        return {
            "success": validation_result >= 0.95,  # 95% compliance threshold
            "result": {"observed_value": validation_result}
        }

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    def _prescriptive_renderer(cls, configuration, result=None, runtime_configuration=None):
        column = configuration.kwargs.get("column")
        business_rule = configuration.kwargs.get("business_rule")
        return RenderedStringTemplateContent(**{
            "content_block_type": "string_template",
            "string_template": {
                "template": "Column $column must comply with business rule: $business_rule",
                "params": {"column": column, "business_rule": business_rule}
            }
        })
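Once the class is imported, GX's registry exposes the snake_case method on validators. A usage sketch (the rule text is hypothetical, and a metric provider for column.custom_business_rule_validation must be registered separately for this to run):

validator.expect_column_values_to_match_business_rule(
    column="amount",
    business_rule="amount must be non-negative for active accounts",
)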
Checkpoint Configuration
def setup_production_checkpoint(context, checkpoint_name):
    """Configure a robust production checkpoint."""
    checkpoint_config = {
        "name": checkpoint_name,
        "config_version": 1.0,
        "template_name": None,
        "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template",
        "expectation_suite_name": "my_suite",
        "batch_request": {
            "datasource_name": "my_datasource",
            "data_connector_name": "default_inferred_data_connector",
            "data_asset_name": "users",
        },
        "action_list": [
            {
                "name": "store_validation_result",
                "action": {
                    "class_name": "StoreValidationResultAction"
                }
            },
            {
                "name": "update_data_docs",
                "action": {
                    "class_name": "UpdateDataDocsAction",
                    "site_names": ["local_site"]
                }
            },
            {
                "name": "send_slack_notification",
                "action": {
                    "class_name": "SlackNotificationAction",
                    "slack_webhook": "${SLACK_WEBHOOK}",
                    "notify_on": "failure",
                    "renderer": {
                        "module_name": "great_expectations.render.renderer.slack_renderer",
                        "class_name": "SlackRenderer"
                    }
                }
            }
        ],
        "evaluation_parameters": {},
        "runtime_configuration": {},
        "validations": []
    }
    return context.add_checkpoint(**checkpoint_config)
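Running it is then one call; ${SLACK_WEBHOOK} is resolved from config_variables.yml or the environment. A sketch with an illustrative checkpoint name:

context = gx.get_context()
setup_production_checkpoint(context, "nightly_users_checkpoint")
results = context.run_checkpoint(checkpoint_name="nightly_users_checkpoint")
print("Validation passed:", results.success)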
Data Quality Monitoring
Automated Data Profiling
from great_expectations.core.batch import BatchRequest
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler

def profile_and_create_suite(context, datasource_name, data_asset_name):
    """Automatically profile data and create an initial expectation suite."""
    batch_request = BatchRequest(
        datasource_name=datasource_name,
        data_connector_name="default_inferred_data_connector",
        data_asset_name=data_asset_name
    )
    validator = context.get_validator(
        batch_request=batch_request,
        create_expectation_suite_with_name=f"{data_asset_name}_profiled_suite"
    )

    # Skip expectations that tend to be brittle on refreshed data
    excluded_expectations = [
        "expect_column_quantile_values_to_be_between",
        "expect_column_mean_to_be_between"
    ]
    profiler = UserConfigurableProfiler(
        profile_dataset=validator,
        excluded_expectations=excluded_expectations,
        ignored_columns=["internal_id", "temp_column"]
    )
    suite = profiler.build_suite()
    context.save_expectation_suite(suite)
    return suite
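Usage sketch, assuming the datasource and asset names from the configuration above:

context = gx.get_context()
suite = profile_and_create_suite(context, "my_datasource", "users")
print(f"Profiler generated {len(suite.expectations)} expectations")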
Best Practices
Expectation Organization
- Group expectations by data quality dimensions (Completeness, Uniqueness, Validity, Consistency); see the sketch after this list
- Use meaningful meta tags for expectation categorization
- Implement expectation versioning for schema evolution
- Create reusable expectation templates for common patterns
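A minimal sketch of dimension-based grouping, assuming each expectation carries a meta "dimension" tag as in the suite above:

from collections import defaultdict

def group_expectations_by_dimension(suite):
    """Bucket a suite's expectations by their meta 'dimension' tag."""
    grouped = defaultdict(list)
    for expectation in suite.expectations:
        dimension = (expectation.meta or {}).get("dimension", "Uncategorized")
        grouped[dimension].append(expectation.expectation_type)
    return dict(grouped)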
Performance Optimization
- Use sampling for large datasets via BatchRequest row limits (a sampling sketch follows this list)
- Implement incremental validation for time-series data
- Cache validation results for repeated checkpoint runs
- Use SQL-based execution engines for database sources
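One way to sample, sketched with a RuntimeBatchRequest; this assumes a RuntimeDataConnector named runtime_data_connector is configured on the datasource, and the file path is hypothetical:

import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest

# Validate a 10k-row sample instead of the full file
df = pd.read_csv("../data/users_2024-01-01.csv").sample(n=10_000, random_state=42)
batch_request = RuntimeBatchRequest(
    datasource_name="my_datasource",
    data_connector_name="runtime_data_connector",   # assumed connector
    data_asset_name="users_sample",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "sampled_batch"},
)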
CI/CD Integration
# Integration script for CI/CD pipelines
import great_expectations as gx

def validate_data_pipeline(data_path, suite_name, fail_on_error=True):
    """Validate data as part of a CI/CD pipeline."""
    context = gx.get_context()
    checkpoint = context.get_checkpoint("pipeline_validation")
    results = checkpoint.run(
        expectation_suite_name=suite_name,
        batch_request={
            "runtime_parameters": {"path": data_path},
            "batch_identifiers": {"default_identifier_name": "pipeline_batch"}
        }
    )
    if not results.success and fail_on_error:
        raise ValueError(f"Data validation failed: {results}")
    return results
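Wired into a CI job, a non-zero exit code is what actually fails the build; a sketch with a hypothetical suite name:

if __name__ == "__main__":
    import sys
    try:
        validate_data_pipeline(sys.argv[1], suite_name="pipeline_suite")
    except ValueError as exc:
        print(exc)
        sys.exit(1)   # non-zero exit fails the CI stage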
Monitoring and Alerting
- Configure multiple notification channels (Slack, email, PagerDuty)
- Implement validation result trending and anomaly detection (a summarizing sketch follows this list)
- Create custom dashboards for data quality metrics
- Set up automated remediation workflows for common failures
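For trending, per-run statistics can be pulled off a CheckpointResult and shipped to your metrics store; a sketch using the 0.x result API:

def summarize_checkpoint_result(checkpoint_result):
    """Aggregate pass/fail counts across all validations in one run."""
    summary = {"evaluated": 0, "successful": 0}
    for validation_result in checkpoint_result.list_validation_results():
        stats = validation_result.statistics
        summary["evaluated"] += stats["evaluated_expectations"]
        summary["successful"] += stats["successful_expectations"]
    return summary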
Advanced Configuration
Multi-environment Setup
- Separate configs for dev/staging/prod environments
- Environment-specific validation thresholds (see the sketch after this list)
- Secure credential management with cloud secret stores
- Automated expectation suite promotion across environments
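A simple pattern for environment-specific thresholds, assuming a GX_ENV variable you define yourself:

import os

# Hypothetical per-environment thresholds keyed by deployment stage
THRESHOLDS = {
    "dev":     {"min_rows": 10,     "mostly": 0.90},
    "staging": {"min_rows": 1_000,  "mostly": 0.95},
    "prod":    {"min_rows": 10_000, "mostly": 0.99},
}

def current_thresholds():
    return THRESHOLDS[os.environ.get("GX_ENV", "dev")]

The values then parameterize expectations at suite-build time, e.g. min_value=current_thresholds()["min_rows"].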