h5py — HDF5 for Python
A Pythonic interface to the HDF5 binary data format. It allows you to store huge amounts of numerical data and easily manipulate that data from NumPy. Features a hierarchical structure similar to a file system. Use for storing datasets larger than RAM, organizing complex scientific data hierarchically, storing numerical arrays with high-speed random access, keeping metadata attached to data, sharing data between languages, and reading/writing large datasets in chunks.
Installation: `pip install h5py`. Key APIs covered below: `h5py.File`, `create_dataset`, `h5py.Group`, `chunks=True`, `compression="gzip"`.
import h5py
import numpy as np

# Writing data
with h5py.File('data.h5', 'w') as f:
    dset = f.create_dataset('main_data', data=np.random.rand(100, 100))
    dset.attrs['units'] = 'meters'          # metadata lives on the dataset
    grp = f.create_group('subgroup')        # groups act like directories
    grp.create_dataset('results', data=[1, 2, 3])

# Reading data
with h5py.File('data.h5', 'r') as f:
    data_slice = f['main_data'][0:10, 0:10]  # Only read 100 elements
    units = f['main_data'].attrs['units']
    print(f"Group content: {list(f['subgroup'].keys())}")
import h5py
import numpy as np

# ❌ BAD: Manual file closing (unsafe)
f = h5py.File('data.h5', 'w')
f.create_dataset('x', data=np.arange(10))
f.close()  # If an error happened above, this never runs!

# ✅ GOOD: Context manager — the file is closed even on error
with h5py.File('data.h5', 'w') as f:
    f.create_dataset('x', data=np.arange(10))

    # ❌ BAD: Storing metadata as strings inside a dataset
    f.create_dataset('meta', data=np.array(['unit: meter', 'date: 2024']))

    # ✅ GOOD: Using Attributes
    dset = f.create_dataset('data', data=np.random.rand(10))
    dset.attrs['unit'] = 'meter'
    dset.attrs['date'] = '2024'

# ❌ BAD: Inefficient chunking (one row at a time when you read columns)
# f.create_dataset('big', shape=(10000, 10000), chunks=(1, 10000))
# Advanced dataset options: resizable shapes, compression, chunking, fill values
with h5py.File('optimized.h5', 'w') as f:
    # 1. Resizable dataset (maxshape)
    dset = f.create_dataset('growing',
                            shape=(100,),
                            maxshape=(None,),   # Allow growth in 1st dimension
                            dtype='float32')

    # 2. Compression and Chunking
    f.create_dataset('compressed',
                     data=np.random.randn(1000, 1000),
                     chunks=(100, 100),
                     compression="gzip",
                     compression_opts=4)        # 4 is a good balance

    # 3. Filling with default values
    f.create_dataset('default', shape=(10, 10), fillvalue=-1.0)
# Hierarchical organization: nested groups are created in one call
with h5py.File('nested.h5', 'w') as f:
    f.create_group('raw/2024/january')
    f.create_group('raw/2024/february')

# Recursive iteration callback: receives (path, object) for every member
def print_structure(name, obj):
    print(name)

with h5py.File('nested.h5', 'r') as f:
    f.visititems(print_structure)  # Visits every dataset and group
    # Accessing via path (must happen while the file is still open)
    feb_data = f['/raw/2024/february']
# SWMR (Single Writer / Multiple Readers) for live monitoring
# Writer
f = h5py.File('live.h5', 'w', libver='latest')
f.swmr_mode = True

# Reader (typically a separate process)
f = h5py.File('live.h5', 'r', libver='latest', swmr=True)

# Create an HDF5 file in memory (backing_store=True flushes to disk on close)
f = h5py.File('memfile.h5', 'w', driver='core', backing_store=True)
def save_ml_dataset(X, y, filename):
    """Store an ML dataset (features, labels, metadata) in one HDF5 file.

    Use case: training on data that exceeds RAM — slices can be read lazily.

    Args:
        X: array of samples; first axis is the sample axis.
        y: array of labels, one per sample.
        filename: path of the HDF5 file to create (overwritten if present).
    """
    with h5py.File(filename, 'w') as f:
        # Create datasets for images and labels
        f.create_dataset('images', data=X, compression="lzf")  # LZF is fast
        f.create_dataset('labels', data=y)
        # Add metadata as file-level attributes
        f.attrs['n_samples'] = X.shape[0]
        f.attrs['input_shape'] = X.shape[1:]
        f.attrs['classes'] = np.unique(y)
def log_simulation_step(filename, step_idx, data_array):
    """Append/write one simulation step into a growing dataset.

    Opens the file in append mode, lazily creating a resizable dataset
    whose first axis indexes the simulation step.

    Args:
        filename: HDF5 file path (created if missing).
        step_idx: zero-based index of this step along axis 0.
        data_array: numpy array holding this step's data; its shape fixes
            the per-step shape on first call.
    """
    with h5py.File(filename, 'a') as f:
        if 'simulation' not in f:
            # Initialize resizable dataset; one chunk per step keeps
            # appends cheap.
            f.create_dataset('simulation',
                             shape=(0, *data_array.shape),
                             maxshape=(None, *data_array.shape),
                             chunks=(1, *data_array.shape))
        dset = f['simulation']
        # Grow only: an unconditional resize(step_idx + 1) would TRUNCATE
        # previously written steps if steps ever arrive out of order.
        if dset.shape[0] < step_idx + 1:
            dset.resize(step_idx + 1, axis=0)
        dset[step_idx] = data_array
def store_images(image_files, h5_file):
    """Store a collection of images as individual datasets in one group.

    Args:
        image_files: iterable of image file paths.
        h5_file: path of the HDF5 file to create (overwritten if present).
    """
    with h5py.File(h5_file, 'w') as f:
        grp = f.create_group('microscopy_data')
        for i, img_path in enumerate(image_files):
            # Load your image here (random data is a placeholder)
            img_data = np.random.rand(512, 512)
            dset = grp.create_dataset(f'img_{i:04d}', data=img_data)
            dset.attrs['original_path'] = img_path  # keep provenance
# ❌ Problem: f.create_dataset('x', ...) fails if 'x' exists
# ✅ Solution: Delete first or use a check
if 'x' in f:
    del f['x']
f.create_dataset('x', data=new_data)
# ❌ Problem: "OSError: Unable to open file (file locking disabled on this file system)"
# This often happens on network drives (NFS).
# ✅ Solution: Set environment variable before running script
import os

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'  # must be set BEFORE importing h5py
import h5py
# ❌ Problem: Storing lists of strings can sometimes cause issues in older versions
# ✅ Solution: Use special string types
dt = h5py.string_dtype(encoding='utf-8')
dset = f.create_dataset('strings', (100,), dtype=dt)
dset[0] = "Научные данные"