Loading...
Loading...
Data validation and pipeline testing utilities for ML training projects. Validates datasets, model checkpoints, training pipelines, and dependencies. Use when validating training data, checking model outputs, testing ML pipelines, verifying dependencies, debugging training failures, or ensuring data quality before training.
npx skill4agent add vanman2024/ai-dev-marketplace validation-scriptsscripts/validate-data.shscripts/validate-model.shscripts/test-pipeline.shscripts/check-dependencies.shtemplates/test-config.yamltemplates/validation-schema.jsonexamples/data-validation-example.mdexamples/pipeline-testing-example.mdbash scripts/validate-data.sh \
--data-path ./data/train.jsonl \
--format jsonl \
--schema templates/validation-schema.jsonbash scripts/validate-model.sh \
--model-path ./checkpoints/epoch-3 \
--framework pytorch \
--check-weightsbash scripts/test-pipeline.sh \
--config templates/test-config.yaml \
--data ./data/sample.jsonl \
--verbosebash scripts/check-dependencies.sh \
--framework pytorch \
--gpu-required \
--min-vram 16bash scripts/validate-data.sh [OPTIONS]--data-path PATH--format FORMAT--schema PATH--sample-size N--check-duplicates--check-null--check-length--check-tokens--tokenizer MODEL--max-length N--output REPORT{
"status": "PASS",
"total_samples": 10000,
"valid_samples": 9987,
"invalid_samples": 13,
"validation_errors": [
{
"sample_id": 42,
"field": "text",
"error": "Exceeds max token length: 2150 > 2048"
},
{
"sample_id": 156,
"field": "label",
"error": "Invalid label value: 'unknwn' (typo)"
}
],
"statistics": {
"avg_text_length": 487,
"avg_token_count": 128,
"label_distribution": {
"positive": 4892,
"negative": 4895,
"neutral": 213
}
},
"recommendations": [
"Remove or fix 13 invalid samples before training",
"Label distribution is imbalanced - consider class weighting"
]
}012bash scripts/validate-model.sh [OPTIONS]--model-path PATH--framework FRAMEWORK--check-weights--check-config--check-tokenizer--check-inference--sample-input TEXT--expected-output TEXT--output REPORT{
"status": "PASS",
"model_path": "./checkpoints/llama-7b-finetuned",
"framework": "pytorch",
"checks": {
"file_structure": "PASS",
"config": "PASS",
"weights": "PASS",
"tokenizer": "PASS",
"inference": "PASS"
},
"model_info": {
"architecture": "LlamaForCausalLM",
"parameters": "7.2B",
"precision": "float16",
"lora_enabled": true,
"lora_rank": 16
},
"memory_estimate": {
"model_size_gb": 13.5,
"inference_vram_gb": 16.2,
"training_vram_gb": 24.8
},
"inference_test": {
"input": "Hello, world!",
"output": "Hello, world! How can I help you today?",
"latency_ms": 142
}
}bash scripts/test-pipeline.sh [OPTIONS]--config PATH--data PATH--steps STEPS--verbose--output REPORT--fail-fast--cleanuppipeline:
name: llama-7b-finetuning
framework: pytorch
data:
train_path: ./data/sample-train.jsonl
val_path: ./data/sample-val.jsonl
format: jsonl
model:
base_model: meta-llama/Llama-2-7b-hf
checkpoint_path: ./checkpoints/test
load_in_8bit: false
lora:
enabled: true
r: 16
alpha: 32
training:
batch_size: 1
gradient_accumulation: 1
learning_rate: 2e-4
max_steps: 5
testing:
sample_size: 10
timeout_seconds: 300
expected_loss_range: [0.5, 3.0]Pipeline Test Report
====================
Pipeline: llama-7b-finetuning
Started: 2025-11-01 12:34:56
Duration: 127 seconds
Test Results:
✓ data_loading (2.3s) - Loaded 10 samples successfully
✓ tokenization (1.1s) - Tokenized all samples, avg length: 128 tokens
✓ model_loading (8.7s) - Model loaded, 7.2B parameters
✓ training_step (15.4s) - Training step completed, loss: 1.847
✓ validation_step (12.1s) - Validation step completed, loss: 1.923
✓ checkpoint_save (3.2s) - Checkpoint saved to ./checkpoints/test
✓ checkpoint_load (6.8s) - Checkpoint loaded successfully
✓ inference (2.9s) - Inference completed, latency: 142ms
✓ metrics (0.4s) - Metrics calculated correctly
Overall: PASS (8/8 tests passed)
Performance Metrics:
- Total time: 127s
- GPU memory used: 15.2 GB
- CPU memory used: 8.4 GB
Recommendations:
- Pipeline is ready for full training
- Consider increasing batch_size to improve throughputbash scripts/check-dependencies.sh [OPTIONS]--framework FRAMEWORK--gpu-required--min-vram GB--cuda-version VERSION--packages FILE--platform PLATFORM--fix--output REPORT{
"status": "PASS",
"platform": "modal",
"checks": {
"python": {
"status": "PASS",
"version": "3.10.12",
"required": ">=3.9"
},
"pytorch": {
"status": "PASS",
"version": "2.1.0",
"cuda_available": true,
"cuda_version": "12.1"
},
"gpu": {
"status": "PASS",
"count": 1,
"type": "NVIDIA A100",
"vram_gb": 40,
"driver_version": "535.129.03"
},
"packages": {
"status": "PASS",
"installed": 42,
"missing": 0,
"outdated": 3
},
"storage": {
"status": "PASS",
"available_gb": 128,
"required_gb": 50
}
},
"recommendations": [
"Update transformers to latest version (4.36.0 -> 4.37.2)",
"Consider upgrading to PyTorch 2.2.0 for better performance"
]
}{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": ["text", "label"],
"properties": {
"text": {
"type": "string",
"minLength": 10,
"maxLength": 5000,
"description": "Input text for training"
},
"label": {
"type": "string",
"enum": ["positive", "negative", "neutral"],
"description": "Classification label"
},
"metadata": {
"type": "object",
"properties": {
"source": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
}
}
}
},
"validation_rules": {
"max_token_length": 2048,
"tokenizer": "meta-llama/Llama-2-7b-hf",
"check_duplicates": true,
"min_label_count": 100,
"max_label_imbalance_ratio": 10.0
}
}requiredpipeline:
name: test-pipeline
framework: pytorch
platform: modal # modal, lambda, runpod, local
data:
train_path: ./data/sample-train.jsonl
val_path: ./data/sample-val.jsonl
format: jsonl # jsonl, csv, parquet
sample_size: 10 # Number of samples to use for testing
model:
base_model: meta-llama/Llama-2-7b-hf
checkpoint_path: ./checkpoints/test
quantization: null # null, 8bit, 4bit
lora:
enabled: true
r: 16
alpha: 32
dropout: 0.05
target_modules: ["q_proj", "v_proj"]
training:
batch_size: 1
gradient_accumulation: 1
learning_rate: 2e-4
max_steps: 5
warmup_steps: 0
eval_steps: 2
testing:
sample_size: 10
timeout_seconds: 300
expected_loss_range: [0.5, 3.0]
fail_fast: true
cleanup: true
validation:
metrics: ["loss", "perplexity"]
check_gradients: true
check_memory_leak: true
gpu:
required: true
min_vram_gb: 16
allow_cpu_fallback: false# 1. Check system dependencies
bash scripts/check-dependencies.sh \
--framework pytorch \
--gpu-required \
--min-vram 16
# 2. Validate training data
bash scripts/validate-data.sh \
--data-path ./data/train.jsonl \
--schema templates/validation-schema.json \
--check-duplicates \
--check-tokens
# 3. Test pipeline with sample data
bash scripts/test-pipeline.sh \
--config templates/test-config.yaml \
--verbose# 1. Validate model checkpoint
bash scripts/validate-model.sh \
--model-path ./checkpoints/final \
--framework pytorch \
--check-weights \
--check-inference
# 2. Test inference pipeline
bash scripts/test-pipeline.sh \
--config templates/test-config.yaml \
--steps inference,metrics# .github/workflows/validate-training.yml
- name: Validate Training Data
run: |
bash plugins/ml-training/skills/validation-scripts/scripts/validate-data.sh \
--data-path ./data/train.jsonl \
--schema ./validation-schema.json
- name: Test Training Pipeline
run: |
bash plugins/ml-training/skills/validation-scripts/scripts/test-pipeline.sh \
--config ./test-config.yaml \
--fail-fastbash scripts/validate-data.sh --data-path ./data/train.jsonl --verbose --debug--sample-size--fail-fast# Ensure you're in the correct directory
cd /path/to/ml-training/skills/validation-scripts
bash scripts/validate-data.sh --help# Make scripts executable
chmod +x scripts/*.sh# Install required tools
pip install jsonschema pandas pyarrow
sudo apt-get install jq bc