init
This commit is contained in:
commit
1fd8a68383
10 changed files with 1118 additions and 0 deletions
30
schema_polars.py
Normal file
30
schema_polars.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import polars as pl
|
||||
import s3fs
|
||||
|
||||
# Script: lazily scan a CSV stored in a MinIO (S3-compatible) bucket with
# polars, print a five-row preview, then print the total row count.

# MinIO S3 endpoint
s3_endpoint_url = "https://minio.wayl.one"  # Use http:// if MinIO is not using SSL

# Create S3 filesystem instance pointed at the custom endpoint
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_endpoint_url})
# The listing result is not used below.
# NOTE(review): looks like a leftover connectivity check -- confirm it is still wanted.
fs.ls("duckdb-playground")

# S3 file path
s3_path = "s3://duckdb-playground/hotel_bookings.csv"

# Pin the dtype of 'children' instead of relying on inference. Together with
# null_values=["NA"] below, "NA" entries are read as nulls rather than text
# (presumably the reason inference needed help for this column -- verify
# against the data).
schema_overrides = {"children": pl.Int64}

# Open the file with s3fs and scan lazily.
# scan_csv only builds a query plan, so every query that executes the plan is
# run inside the `with` block, while the underlying file handle is still open.
with fs.open(s3_path, mode="rb") as f:
    df = pl.scan_csv(
        f,
        infer_schema_length=10000,  # Increase schema inference length
        schema_overrides=schema_overrides,  # Override 'children' type
        null_values=["NA"],  # Treat "NA" as null
    )

    # Show the first few rows. LazyFrame.fetch is deprecated; head().collect()
    # is the documented replacement.
    print(df.head(5).collect())

    # Count all rows; the result is a one-row, one-column DataFrame.
    row_count = df.select(pl.len()).collect()
    print(row_count)
|
||||
Loading…
Add table
Add a link
Reference in a new issue