data-quality-auditor
Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
Data Quality Auditor
数据质量审核工具
Comprehensive data quality assessment for CSV/Excel datasets.
针对CSV/Excel数据集的全面数据质量评估。
Features
功能特性
- Completeness: Missing values analysis
- Uniqueness: Duplicate detection
- Validity: Type validation and constraints
- Consistency: Pattern and format checks
- Quality Score: Overall data quality metric
- Reports: Detailed HTML/JSON reports
- 完整性:缺失值分析
- 唯一性:重复项检测
- 有效性:类型验证与约束检查
- 一致性:模式与格式检查
- 质量得分:整体数据质量指标
- 报告:详细的HTML/JSON报告
Quick Start
快速开始
python
from data_quality_auditor import DataQualityAuditor
auditor = DataQualityAuditor()
auditor.load_csv("customers.csv")
python
from data_quality_auditor import DataQualityAuditor
auditor = DataQualityAuditor()
auditor.load_csv("customers.csv")
# Run full audit
# Run full audit
report = auditor.audit()
print(f"Quality Score: {report['quality_score']}/100")
report = auditor.audit()
print(f"Quality Score: {report['quality_score']}/100")
# Check specific issues
# Check specific issues
missing = auditor.check_missing()
duplicates = auditor.check_duplicates()
missing = auditor.check_missing()
duplicates = auditor.check_duplicates()
CLI Usage
CLI 使用方法
bash
bash
# Full audit
# Full audit
python data_quality_auditor.py --input data.csv
python data_quality_auditor.py --input data.csv
# Generate HTML report
# Generate HTML report
python data_quality_auditor.py --input data.csv --report report.html
python data_quality_auditor.py --input data.csv --report report.html
# Check specific aspects
# Check specific aspects
python data_quality_auditor.py --input data.csv --missing
python data_quality_auditor.py --input data.csv --duplicates
python data_quality_auditor.py --input data.csv --types
python data_quality_auditor.py --input data.csv --missing
python data_quality_auditor.py --input data.csv --duplicates
python data_quality_auditor.py --input data.csv --types
# JSON output
# JSON output
python data_quality_auditor.py --input data.csv --json
python data_quality_auditor.py --input data.csv --json
# Validate against rules
# Validate against rules
python data_quality_auditor.py --input data.csv --rules rules.json
python data_quality_auditor.py --input data.csv --rules rules.json
API Reference
API 参考
DataQualityAuditor Class
DataQualityAuditor 类
python
class DataQualityAuditor:
def __init__(self)
# Data loading
def load_csv(self, filepath: str, **kwargs) -> 'DataQualityAuditor'
def load_dataframe(self, df: pd.DataFrame) -> 'DataQualityAuditor'
# Full audit
def audit(self) -> dict
def quality_score(self) -> float
# Individual checks
def check_missing(self) -> dict
def check_duplicates(self, subset: list = None) -> dict
def check_types(self) -> dict
def check_uniqueness(self) -> dict
def check_patterns(self, column: str, pattern: str) -> dict
# Validation
def validate_column(self, column: str, rules: dict) -> dict
def validate_dataset(self, rules: dict) -> dict
# Reports
def generate_report(self, output: str, format: str = "html") -> str
def summary(self) -> str
python
class DataQualityAuditor:
def __init__(self)
# Data loading
def load_csv(self, filepath: str, **kwargs) -> 'DataQualityAuditor'
def load_dataframe(self, df: pd.DataFrame) -> 'DataQualityAuditor'
# Full audit
def audit(self) -> dict
def quality_score(self) -> float
# Individual checks
def check_missing(self) -> dict
def check_duplicates(self, subset: list = None) -> dict
def check_types(self) -> dict
def check_uniqueness(self) -> dict
def check_patterns(self, column: str, pattern: str) -> dict
# Validation
def validate_column(self, column: str, rules: dict) -> dict
def validate_dataset(self, rules: dict) -> dict
# Reports
def generate_report(self, output: str, format: str = "html") -> str
def summary(self) -> str
Quality Checks
质量检查项
Missing Values
缺失值
python
missing = auditor.check_missing()
python
missing = auditor.check_missing()
# Returns:
# Returns:
{
"total_cells": 10000,
"missing_cells": 150,
"missing_percent": 1.5,
"by_column": {
"email": {"count": 50, "percent": 5.0},
"phone": {"count": 100, "percent": 10.0}
},
"rows_with_missing": 120
}
{
"total_cells": 10000,
"missing_cells": 150,
"missing_percent": 1.5,
"by_column": {
"email": {"count": 50, "percent": 5.0},
"phone": {"count": 100, "percent": 10.0}
},
"rows_with_missing": 120
}
Duplicates
重复项
python
dups = auditor.check_duplicates()
python
dups = auditor.check_duplicates()
# Returns:
# Returns:
{
"total_rows": 1000,
"duplicate_rows": 25,
"duplicate_percent": 2.5,
"duplicate_groups": [...],
"by_columns": {
"email": {"duplicates": 15},
"phone": {"duplicates": 20}
}
}
{
"total_rows": 1000,
"duplicate_rows": 25,
"duplicate_percent": 2.5,
"duplicate_groups": [...],
"by_columns": {
"email": {"duplicates": 15},
"phone": {"duplicates": 20}
}
}
Type Validation
类型验证
python
types = auditor.check_types()
python
types = auditor.check_types()
# Returns:
# Returns:
{
"columns": {
"age": {
"detected_type": "int64",
"unique_values": 75,
"sample_values": [25, 30, 45],
"issues": []
},
"date": {
"detected_type": "object",
"unique_values": 365,
"sample_values": ["2023-01-01", "invalid"],
"issues": ["Mixed date formats detected"]
}
}
}
{
"columns": {
"age": {
"detected_type": "int64",
"unique_values": 75,
"sample_values": [25, 30, 45],
"issues": []
},
"date": {
"detected_type": "object",
"unique_values": 365,
"sample_values": ["2023-01-01", "invalid"],
"issues": ["Mixed date formats detected"]
}
}
}
Validation Rules
验证规则
Define custom validation rules:
json
{
"columns": {
"email": {
"required": true,
"unique": true,
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
},
"age": {
"type": "integer",
"min": 0,
"max": 120
},
"status": {
"allowed_values": ["active", "inactive", "pending"]
},
"created_at": {
"type": "date",
"format": "%Y-%m-%d"
}
}
}
python
results = auditor.validate_dataset(rules)
定义自定义验证规则:
json
{
"columns": {
"email": {
"required": true,
"unique": true,
"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
},
"age": {
"type": "integer",
"min": 0,
"max": 120
},
"status": {
"allowed_values": ["active", "inactive", "pending"]
},
"created_at": {
"type": "date",
"format": "%Y-%m-%d"
}
}
}
python
results = auditor.validate_dataset(rules)
Quality Score
质量得分
The quality score (0-100) is calculated from:
- Completeness (30%): Missing value ratio
- Uniqueness (25%): Duplicate row ratio
- Validity (25%): Type and constraint compliance
- Consistency (20%): Format and pattern adherence
python
score = auditor.quality_score()
质量得分(0-100)由以下部分计算得出:
- 完整性(30%):缺失值占比
- 唯一性(25%):重复行占比
- 有效性(25%):类型与约束合规性
- 一致性(20%):格式与模式遵循度
python
score = auditor.quality_score()
# 85.5
# 85.5
Output Formats
输出格式
Audit Report
审核报告
python
{
"file": "data.csv",
"rows": 1000,
"columns": 15,
"quality_score": 85.5,
"completeness": {
"score": 92.0,
"missing_cells": 800,
"details": {...}
},
"uniqueness": {
"score": 97.5,
"duplicate_rows": 25,
"details": {...}
},
"validity": {
"score": 78.0,
"type_issues": [...],
"details": {...}
},
"consistency": {
"score": 80.0,
"pattern_issues": [...],
"details": {...}
},
"recommendations": [
"Column 'phone' has 10% missing values",
"25 duplicate rows detected",
"Column 'date' has inconsistent formats"
]
}
python
{
"file": "data.csv",
"rows": 1000,
"columns": 15,
"quality_score": 85.5,
"completeness": {
"score": 92.0,
"missing_cells": 800,
"details": {...}
},
"uniqueness": {
"score": 97.5,
"duplicate_rows": 25,
"details": {...}
},
"validity": {
"score": 78.0,
"type_issues": [...],
"details": {...}
},
"consistency": {
"score": 80.0,
"pattern_issues": [...],
"details": {...}
},
"recommendations": [
"Column 'phone' has 10% missing values",
"25 duplicate rows detected",
"Column 'date' has inconsistent formats"
]
}
Example Workflows
示例工作流
Pre-Import Validation
导入前验证
python
auditor = DataQualityAuditor()
auditor.load_csv("import_data.csv")
report = auditor.audit()
if report['quality_score'] < 80:
    print("Data quality below threshold!")
    for rec in report['recommendations']:
        print(f" - {rec}")
    exit(1)
python
auditor = DataQualityAuditor()
auditor.load_csv("import_data.csv")
report = auditor.audit()
if report['quality_score'] < 80:
    print("Data quality below threshold!")
    for rec in report['recommendations']:
        print(f" - {rec}")
    exit(1)
ETL Pipeline Check
ETL 管道检查
python
auditor = DataQualityAuditor()
auditor.load_dataframe(transformed_df)
python
auditor = DataQualityAuditor()
auditor.load_dataframe(transformed_df)
# Check critical columns
# Check critical columns
email_check = auditor.validate_column("email", {
    "required": True,
    "unique": True,
    "pattern": r"^[\w.+-]+@[\w-]+\.[\w.-]+$"
})
if email_check['issues']:
    raise ValueError(f"Email validation failed: {email_check['issues']}")
email_check = auditor.validate_column("email", {
    "required": True,
    "unique": True,
    "pattern": r"^[\w.+-]+@[\w-]+\.[\w.-]+$"
})
if email_check['issues']:
    raise ValueError(f"Email validation failed: {email_check['issues']}")
Generate Documentation
生成文档
python
auditor = DataQualityAuditor()
auditor.load_csv("dataset.csv")
python
auditor = DataQualityAuditor()
auditor.load_csv("dataset.csv")
# Generate comprehensive report
# Generate comprehensive report
auditor.generate_report("quality_report.html", format="html")
auditor.generate_report("quality_report.html", format="html")
# Or get summary text
# Or get summary text
print(auditor.summary())
print(auditor.summary())
Dependencies
依赖项
- pandas>=2.0.0
- numpy>=1.24.0
- pandas>=2.0.0
- numpy>=1.24.0