Reading and writing data with Pandas from/to cloud storage (S3, GCS, Azure) using fsspec and PyArrow filesystems.

Install the skill:

```bash
npx skill4agent add legout/data-platform-agent-skills data-engineering-storage-remote-access-integrations-pandas
```

pandas can read and write cloud URIs directly:

```python
import pandas as pd

# Read CSV/Parquet/JSON directly from cloud URIs
df = pd.read_csv("s3://bucket/data.csv")
df = pd.read_parquet("s3://bucket/data.parquet")
df = pd.read_json("gs://bucket/data.json")

# Compression is auto-detected
df = pd.read_csv("s3://bucket/data.csv.gz")  # Automatically decompressed
```

For explicit configuration, create an fsspec filesystem and hand it to pandas:

```python
import fsspec
import pandas as pd

# Create fsspec filesystem with configuration
fs = fsspec.filesystem("s3", anon=False) # Uses default credentials chain
# Open file through filesystem
with fs.open("s3://bucket/data.csv") as f:
    df = pd.read_csv(f)
# Or pass filesystem directly (recommended for performance)
df = pd.read_parquet(
    "s3://bucket/data.parquet",
    filesystem=fs,
    columns=["id", "value"],                 # Column pruning reduces data transfer
    filters=[("date", ">=", "2024-01-01")],  # Row-group filtering
)
```
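
The same pattern covers the other providers mentioned above. A hedged sketch: it assumes gcsfs and adlfs are installed, and the bucket, container, and account names are placeholders:

```python
import fsspec
import pandas as pd

# Google Cloud Storage (backed by gcsfs)
gcs = fsspec.filesystem("gs")
with gcs.open("gs://bucket/data.csv") as f:
    df = pd.read_csv(f)

# Azure Blob Storage (backed by adlfs)
abfs = fsspec.filesystem("abfs", account_name="myaccount")
with abfs.open("abfs://container/data.csv") as f:
    df = pd.read_csv(f)
```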

PyArrow filesystems work the same way:

```python
import pyarrow.fs as fs
import pandas as pd

s3_fs = fs.S3FileSystem(region="us-east-1")
# Read with column filtering
df = pd.read_parquet(
    "bucket/data.parquet",  # Note: no s3:// prefix when using a filesystem
    filesystem=s3_fs,
    columns=["id", "name", "value"],
)

# Write to cloud storage
df.to_parquet(
    "bucket/output/",  # Again, no s3:// prefix with a PyArrow filesystem
    filesystem=s3_fs,
    partition_cols=["year", "month"],  # Partitioned write
)
```
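
Reading the partitioned output back works the same way: pyarrow reconstructs year and month as columns from the directory names, and filters= on partition columns skips whole partitions. A sketch, reusing the s3_fs handle from above:

```python
# Read the partitioned dataset back; partition columns are restored from paths
df = pd.read_parquet(
    "bucket/output/",
    filesystem=s3_fs,
    filters=[("year", "=", 2024)],  # non-matching partitions are never downloaded
)
```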

Partitioned writes produce a Hive-style directory layout:

```python
import fsspec
import pandas as pd

df = pd.DataFrame({
"id": [1, 2, 3],
"year": [2024, 2024, 2023],
"month": [1, 2, 12],
"value": [100.0, 200.0, 150.0]
})
# Using fsspec
fs = fsspec.filesystem("s3")
df.to_parquet(
"s3://bucket/output/",
partition_cols=["year", "month"],
filesystem=fs
)

# Output structure: s3://bucket/output/year=2024/month=1/part-0.parquet
```
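
To check what a partitioned write produced, fsspec filesystems can list matching files with glob(). A small sketch reusing the fs handle from the write above:

```python
# List the parquet files under each year=/month= partition
for path in fs.glob("s3://bucket/output/year=*/month=*/*.parquet"):
    print(path)
```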
"endpoint_url": "http://minio.local:9000"
})@data-engineering-storage-authenticationpd.read_parquet(columns=[...])filters=simplecache::filecache::cached_fs = fsspec.filesystem("simplecache", target_protocol="s3")
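
The same endpoint override works on one-off calls through storage_options, without building a filesystem object first; the endpoint and path are placeholders:

```python
import pandas as pd

df = pd.read_parquet(
    "s3://bucket/data.parquet",
    storage_options={"client_kwargs": {"endpoint_url": "http://minio.local:9000"}},
)
```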
df = pd.read_parquet("simplecache::s3://bucket/data.parquet", filesystem=cached_fs)@data-engineering-corefs.glob()@data-engineering-core@data-engineering-storage-remote-access/libraries/fsspec