# Listing metadata: 30 lines, 860 B, Python
import polars as pl
import s3fs

# MinIO S3 endpoint (use http:// instead if MinIO is not using SSL).
s3_endpoint_url = "https://minio.wayl.one"

# Create an S3 filesystem instance that talks to the endpoint above.
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_endpoint_url})
# NOTE(review): the listing result is discarded -- presumably kept as a quick
# connectivity check against the bucket; confirm or drop.
fs.ls("duckdb-playground")

# S3 file path
s3_path = "s3://duckdb-playground/hotel_bookings.csv"

# Pin 'children' to Int64 rather than relying on schema inference.
schema_overrides = {"children": pl.Int64}

# Open the object through s3fs and build the lazy CSV scan from that handle.
with fs.open(s3_path, mode="rb") as f:
    df = pl.scan_csv(
        f,
        infer_schema_length=10000,  # Increase schema inference length
        schema_overrides=schema_overrides,  # Override 'children' type
        null_values=["NA"],  # Treat "NA" as null
    )

    # The queries are executed while the handle is still open so that the
    # lazy plan can never end up reading from a closed file object.

    # Show the first few rows. `LazyFrame.fetch` is deprecated; the supported
    # spelling is `head(n)` followed by `collect()`.
    print(df.head(5).collect())

    # Total number of rows, returned as a one-row DataFrame.
    row_count = df.select(pl.len()).collect()
    print(row_count)