Loading...
Loading...
Cuantización de modelos ML a FP16/INT8 para reducir memoria y acelerar inferencia en el pipeline KYC
# Install helper skill (shell command — run in a terminal, not in Python):
#   npx skill4agent add davidcastagnetoa/skills fp16_int8_quantization
from modules.face_match.evaluator import evaluate_model
from onnxruntime.transformers import float16

# Baseline metrics of the FP32 ArcFace model, so post-quantization accuracy
# (FAR/FRR) and latency can be compared against it.
# NOTE(review): `test_dataset` is not defined in this chunk — confirm it is
# built earlier in the pipeline.
baseline_metrics = evaluate_model("models/arcface_fp32.onnx", test_dataset)
print(f"FAR: {baseline_metrics['far']}, FRR: {baseline_metrics['frr']}, Latency: {baseline_metrics['latency_ms']}ms")
import onnx

# Load the FP32 ArcFace graph and produce an FP16 copy. keep_io_types=True
# leaves the graph inputs/outputs in float32, so callers need no dtype changes.
fp32_graph = onnx.load("models/arcface_fp32.onnx")
model_fp16 = float16.convert_float_to_float16(fp32_graph, keep_io_types=True)
onnx.save(model_fp16, "models/arcface_fp16.onnx")

# FIX: the original import listed only `quantize_static, QuantType` and sat
# AFTER the class definition, so `CalibrationDataReader` (the base class) and
# `QuantFormat` (used in the quantize_static call) raised NameError. Import
# everything needed, before first use.
from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,
    QuantType,
    quantize_static,
)


class KYCCalibrationDataReader(CalibrationDataReader):
    """Feeds preprocessed KYC face images to ONNX Runtime static quantization.

    ORT calls get_next() repeatedly during calibration; returning None
    signals that the calibration set is exhausted.
    """

    def __init__(self, calibration_images_dir: str):
        # NOTE(review): _load_and_preprocess is not defined in this chunk —
        # confirm it exists on this class elsewhere in the file.
        self.data = self._load_and_preprocess(calibration_images_dir)
        self.iter = iter(self.data)

    def get_next(self):
        # One calibration sample per call; None terminates calibration.
        return next(self.iter, None)
# Static INT8 quantization (QDQ format) of weights and activations, driven by
# the KYC calibration image set.
quantize_static(
    model_input="models/arcface_fp32.onnx",
    model_output="models/arcface_int8.onnx",
    calibration_data_reader=KYCCalibrationDataReader("data/calibration/faces/"),
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
)

# Re-evaluate the quantized model on the same test set as the FP32 baseline.
quantized_metrics = evaluate_model("models/arcface_int8.onnx", test_dataset)
# FIX: the original computed baseline - quantized, which is NEGATIVE exactly
# when quantization makes FAR worse (higher), so the downstream `< 0.01`
# threshold could never fire in the case it guards against. Use the absolute
# drift so any FAR change beyond tolerance is caught.
degradation = abs(quantized_metrics['far'] - baseline_metrics['far'])
# FIX: `assert` is stripped under `python -O`, so the quality gate could be
# silently skipped in production. Use an explicit check that always runs.
if degradation >= 0.01:
    raise RuntimeError("Degradación de FAR inaceptable tras cuantización")

# Mixed-precision fallback: quantize everything EXCEPT the accuracy-sensitive
# layers (first conv, final FC, final batch-norm), which stay in FP32.
# FIX: the original passed an undefined `calibration_reader`. A fresh reader
# is required in any case — the earlier reader's iterator is exhausted after
# the INT8 calibration pass.
calibration_reader = KYCCalibrationDataReader("data/calibration/faces/")
quantize_static(
    model_input="models/arcface_fp32.onnx",
    model_output="models/arcface_mixed.onnx",
    calibration_data_reader=calibration_reader,
    nodes_to_exclude=["first_conv", "final_fc", "bn_final"],
)