duckdb-playground/schema_polars.py
Waylon S. Walker 1fd8a68383 init
2025-03-15 20:27:48 -05:00

30 lines
860 B
Python

import polars as pl
import s3fs
# MinIO S3 endpoint
s3_endpoint_url = "https://minio.wayl.one"  # Use http:// if MinIO is not using SSL

# Create an S3 filesystem instance that talks to the endpoint above
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_endpoint_url})
# NOTE(review): the listing result is discarded; presumably this call is only
# a connectivity/credentials smoke check -- confirm, or remove it.
fs.ls("duckdb-playground")

# S3 file path
s3_path = "s3://duckdb-playground/hotel_bookings.csv"

# Pin 'children' to Int64 instead of relying on schema inference; combined
# with null_values below, "NA" entries in that column are read as nulls.
schema_overrides = {"children": pl.Int64}

# Open the file with s3fs and scan lazily.
# Everything that evaluates the LazyFrame stays inside the `with` block so the
# file handle is still open when polars reads from it at collect time.
with fs.open(s3_path, mode="rb") as f:
    df = pl.scan_csv(
        f,
        infer_schema_length=10000,  # Increase schema inference length
        schema_overrides=schema_overrides,  # Override 'children' type
        null_values=["NA"],  # Treat "NA" as null
    )

    # Show the first few rows. LazyFrame.fetch() is deprecated in current
    # polars; head(n).collect() is the supported equivalent.
    print(df.head(5).collect())

    # Count all rows in the file (returns a 1x1 DataFrame).
    row_count = df.select(pl.len()).collect()
    print(row_count)